[ { "dashboard": "Atlas GPU", "panel_title": "Namespace GPU Share", "panel_id": 1, "panel_type": "piechart", "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", "tags": [ "atlas", "gpu" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" ] }, { "dashboard": "Atlas GPU", "panel_title": "GPU Util by Namespace", "panel_id": 2, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "gpu" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)" ] }, { "dashboard": "Atlas GPU", "panel_title": "GPU Util by Node", "panel_id": 3, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "gpu" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})" ] }, { "dashboard": "Atlas GPU", "panel_title": "Top Pods by GPU Util", "panel_id": 4, "panel_type": "table", "description": "", "tags": [ "atlas", "gpu" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Ariadne Task Errors (range)", "panel_id": 1, "panel_type": "bargauge", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sort_desc(sum by (task) 
(increase(ariadne_task_runs_total{status=\"error\"}[$__range])))" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Ariadne Attempts / Failures", "panel_id": 2, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(increase(ariadne_task_runs_total[$__interval]))", "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" ] }, { "dashboard": "Atlas Jobs", "panel_title": "One-off Job Pods (age hours)", "panel_id": 3, "panel_type": "bargauge", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"} == 1))" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Glue Jobs Stale (>36h)", "panel_id": 4, "panel_type": "stat", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)) + (count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0))" ] }, { "dashboard": "Atlas Jobs", 
"panel_title": "Glue Jobs Missing Success", "panel_id": 5, "panel_type": "stat", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Glue Jobs Suspended", "panel_id": 6, "panel_type": "stat", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Ariadne Task Errors (1h)", "panel_id": 7, "panel_type": "stat", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Ariadne Task Errors (24h)", "panel_id": 8, "panel_type": "stat", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Ariadne Task Runs (1h)", "panel_id": 9, "panel_type": "stat", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(increase(ariadne_task_runs_total[1h]))" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Ariadne Schedule Last Error (hours ago)", "panel_id": 10, "panel_type": "bargauge", "description": "", "tags": [ "atlas", "jobs", "glue" ], 
"datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Ariadne Schedule Last Success (hours ago)", "panel_id": 11, "panel_type": "bargauge", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Glue Jobs Last Success (hours ago)", "panel_id": 12, "panel_type": "bargauge", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Glue Jobs Last Schedule (hours ago)", "panel_id": 13, "panel_type": "bargauge", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Ariadne Task Errors (1h)", "panel_id": 14, "panel_type": "bargauge", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Ariadne Task Errors (30d)", "panel_id": 15, "panel_type": "bargauge", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": 
"prometheus", "exprs": [ "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Ariadne Access Requests", "panel_id": 16, "panel_type": "bargauge", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sort_desc(ariadne_access_requests_total)" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Ariadne CI Coverage (%)", "panel_id": 17, "panel_type": "stat", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "ariadne_ci_coverage_percent{repo=\"ariadne\"}" ] }, { "dashboard": "Atlas Jobs", "panel_title": "Ariadne CI Tests (latest)", "panel_id": 18, "panel_type": "table", "description": "", "tags": [ "atlas", "jobs", "glue" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "ariadne_ci_tests_total{repo=\"ariadne\"}" ] }, { "dashboard": "Atlas Mail", "panel_title": "Sent (1d)", "panel_id": 1, "panel_type": "stat", "description": "", "tags": [ "atlas", "mail" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "max(postmark_outbound_sent{window=\"1d\"})" ] }, { "dashboard": "Atlas Mail", "panel_title": "Sent (7d)", "panel_id": 2, "panel_type": "stat", "description": "", "tags": [ "atlas", "mail" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "max(postmark_outbound_sent{window=\"7d\"})" ] }, { "dashboard": "Atlas Mail", "panel_title": "Mail Bounces (1d)", "panel_id": 3, "panel_type": "stat", "description": "", "tags": [ "atlas", "mail" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "max(postmark_outbound_bounce_rate{window=\"1d\"})", "max(postmark_outbound_bounced{window=\"1d\"})" ] }, { "dashboard": "Atlas Mail", "panel_title": "Success Rate (1d)", "panel_id": 4, "panel_type": "stat", "description": "", "tags": [ "atlas", "mail" 
], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" ] }, { "dashboard": "Atlas Mail", "panel_title": "Limit Used (30d)", "panel_id": 5, "panel_type": "stat", "description": "", "tags": [ "atlas", "mail" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "max(postmark_sending_limit_used_percent)" ] }, { "dashboard": "Atlas Mail", "panel_title": "Send Limit (30d)", "panel_id": 6, "panel_type": "stat", "description": "", "tags": [ "atlas", "mail" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "max(postmark_sending_limit)" ] }, { "dashboard": "Atlas Mail", "panel_title": "Last Success", "panel_id": 7, "panel_type": "stat", "description": "", "tags": [ "atlas", "mail" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "max(postmark_last_success_timestamp_seconds)" ] }, { "dashboard": "Atlas Mail", "panel_title": "Exporter Errors", "panel_id": 8, "panel_type": "stat", "description": "", "tags": [ "atlas", "mail" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(postmark_request_errors_total)" ] }, { "dashboard": "Atlas Mail", "panel_title": "Bounce Rate (1d vs 7d)", "panel_id": 13, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "mail" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "max by (window) (postmark_outbound_bounce_rate)" ] }, { "dashboard": "Atlas Mail", "panel_title": "Bounced (1d vs 7d)", "panel_id": 14, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "mail" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "max by (window) (postmark_outbound_bounced)" ] }, { "dashboard": "Atlas Mail", "panel_title": "Sent (1d vs 7d)", "panel_id": 15, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "mail" ], "datasource_uid": "atlas-vm", "datasource_type": 
"prometheus", "exprs": [ "max by (window) (postmark_outbound_sent)" ] }, { "dashboard": "Atlas Mail", "panel_title": "Exporter Errors", "panel_id": 16, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "mail" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(postmark_request_errors_total)" ] }, { "dashboard": "Atlas Network", "panel_title": "Ingress Success Rate (5m)", "panel_id": 1, "panel_type": "stat", "description": "", "tags": [ "atlas", "network" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)" ] }, { "dashboard": "Atlas Network", "panel_title": "Error Budget Burn (1h)", "panel_id": 2, "panel_type": "stat", "description": "", "tags": [ "atlas", "network" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009" ] }, { "dashboard": "Atlas Network", "panel_title": "Error Budget Burn (6h)", "panel_id": 3, "panel_type": "stat", "description": "", "tags": [ "atlas", "network" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009" ] }, { "dashboard": "Atlas Network", "panel_title": "Edge P99 Latency (ms)", "panel_id": 4, "panel_type": "stat", "description": "", "tags": [ "atlas", "network" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" ] }, { "dashboard": "Atlas Network", "panel_title": "Ingress Traffic", "panel_id": 5, "panel_type": "stat", "description": 
"", "tags": [ "atlas", "network" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" ] }, { "dashboard": "Atlas Network", "panel_title": "Egress Traffic", "panel_id": 6, "panel_type": "stat", "description": "", "tags": [ "atlas", "network" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" ] }, { "dashboard": "Atlas Network", "panel_title": "Intra-Cluster Traffic", "panel_id": 7, "panel_type": "stat", "description": "", "tags": [ "atlas", "network" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" ] }, { "dashboard": "Atlas Network", "panel_title": "Per-Node Throughput", "panel_id": 8, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "network" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" ] }, { "dashboard": "Atlas Network", "panel_title": "Top Namespaces", "panel_id": 9, "panel_type": "table", "description": "", "tags": [ "atlas", "network" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "topk(10, 
sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))" ] }, { "dashboard": "Atlas Network", "panel_title": "Top Pods", "panel_id": 10, "panel_type": "table", "description": "", "tags": [ "atlas", "network" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))" ] }, { "dashboard": "Atlas Network", "panel_title": "Traefik Routers (req/s)", "panel_id": 11, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "network" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))" ] }, { "dashboard": "Atlas Network", "panel_title": "Traefik Entrypoints (req/s)", "panel_id": 12, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "network" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))" ] }, { "dashboard": "Atlas Nodes", "panel_title": "Worker Nodes Ready", "panel_id": 1, "panel_type": "stat", "description": "", "tags": [ "atlas", "nodes" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" ] }, { "dashboard": "Atlas Nodes", "panel_title": "Control Plane Ready", "panel_id": 2, "panel_type": "stat", "description": "", "tags": [ "atlas", "nodes" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ 
"sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" ] }, { "dashboard": "Atlas Nodes", "panel_title": "Control Plane Workloads", "panel_id": 3, "panel_type": "stat", "description": "", "tags": [ "atlas", "nodes" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" ] }, { "dashboard": "Atlas Nodes", "panel_title": "API Server 5xx rate", "panel_id": 9, "panel_type": "stat", "description": "", "tags": [ "atlas", "nodes" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))" ] }, { "dashboard": "Atlas Nodes", "panel_title": "API Server P99 latency", "panel_id": 10, "panel_type": "stat", "description": "", "tags": [ "atlas", "nodes" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000" ] }, { "dashboard": "Atlas Nodes", "panel_title": "etcd P99 latency", "panel_id": 11, "panel_type": "stat", "description": "", "tags": [ "atlas", "nodes" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000" ] }, { "dashboard": "Atlas Nodes", "panel_title": "Node CPU", "panel_id": 4, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "nodes" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" ] }, { "dashboard": "Atlas Nodes", "panel_title": "Node RAM", "panel_id": 5, "panel_type": 
"timeseries", "description": "", "tags": [ "atlas", "nodes" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" ] }, { "dashboard": "Atlas Nodes", "panel_title": "Control Plane (incl. titan-db) CPU", "panel_id": 6, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "nodes" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" ] }, { "dashboard": "Atlas Nodes", "panel_title": "Control Plane (incl. 
titan-db) RAM", "panel_id": 7, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "nodes" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" ] }, { "dashboard": "Atlas Nodes", "panel_title": "Root Filesystem Usage", "panel_id": 8, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "nodes" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" ] }, { "dashboard": "Atlas Overview", "panel_title": "Control Plane Ready", "panel_id": 2, "panel_type": "gauge", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" ] }, { "dashboard": "Atlas Overview", "panel_title": "Control Plane Workloads", "panel_id": 3, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)" ] }, { "dashboard": "Atlas Overview", "panel_title": "Stuck Terminating", 
"panel_id": 5, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" ] }, { "dashboard": "Atlas Overview", "panel_title": "Atlas Availability", "panel_id": 27, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])" ] }, { "dashboard": "Atlas Overview", "panel_title": "Problem Pods", "panel_id": 4, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" ] }, { "dashboard": "Atlas Overview", "panel_title": "CrashLoop / ImagePull", "panel_id": 6, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" ] }, { "dashboard": "Atlas Overview", "panel_title": "Workers Ready", "panel_id": 1, "panel_type": "gauge", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ 
"sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" ] }, { "dashboard": "Atlas Overview", "panel_title": "Hottest node: CPU", "panel_id": 7, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" ] }, { "dashboard": "Atlas Overview", "panel_title": "Hottest node: RAM", "panel_id": 8, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" ] }, { "dashboard": "Atlas Overview", "panel_title": "Hottest node: NET (rx+tx)", "panel_id": 9, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" ] }, { "dashboard": "Atlas Overview", "panel_title": 
"Hottest node: I/O (r+w)", "panel_id": 10, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" ] }, { "dashboard": "Atlas Overview", "panel_title": "Mail Sent (1d)", "panel_id": 30, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "max(postmark_outbound_sent{window=\"1d\"})" ] }, { "dashboard": "Atlas Overview", "panel_title": "Mail Bounces (1d)", "panel_id": 31, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "max(postmark_outbound_bounce_rate{window=\"1d\"})", "max(postmark_outbound_bounced{window=\"1d\"})" ] }, { "dashboard": "Atlas Overview", "panel_title": "Mail Success Rate (1d)", "panel_id": 32, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" ] }, { "dashboard": "Atlas Overview", "panel_title": "Mail Limit Used (30d)", "panel_id": 33, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "max(postmark_sending_limit_used_percent)" ] }, { "dashboard": "Atlas Overview", "panel_title": "Postgres Connections Used", "panel_id": 34, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ 
"label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")" ] }, { "dashboard": "Atlas Overview", "panel_title": "Postgres Hottest Connections", "panel_id": 35, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "topk(1, sum by (datname) (pg_stat_activity_count))" ] }, { "dashboard": "Atlas Overview", "panel_title": "Astreae Usage", "panel_id": 23, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" ] }, { "dashboard": "Atlas Overview", "panel_title": "Asteria Usage", "panel_id": 24, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" ] }, { "dashboard": "Atlas Overview", "panel_title": "Astreae Free", "panel_id": 25, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" ] }, { "dashboard": "Atlas Overview", "panel_title": "Asteria Free", "panel_id": 26, "panel_type": "stat", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" ] }, { 
"dashboard": "Atlas Overview", "panel_title": "One-off Job Pods (age hours)", "panel_id": 40, "panel_type": "bargauge", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" ] }, { "dashboard": "Atlas Overview", "panel_title": "Ariadne Attempts / Failures", "panel_id": 41, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(increase(ariadne_task_runs_total[$__interval]))", "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" ] }, { "dashboard": "Atlas Overview", "panel_title": "Ariadne Test Success Rate", "panel_id": 42, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)" ] }, { "dashboard": "Atlas Overview", "panel_title": "Tests with Failures (24h)", "panel_id": 43, "panel_type": "bargauge", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))" ] }, { "dashboard": "Atlas Overview", "panel_title": "Namespace CPU Share", "panel_id": 11, "panel_type": "piechart", "description": "Shares are 
normalized within the selected filter. Switching scope changes the denominator.", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)" ] }, { "dashboard": "Atlas Overview", "panel_title": "Namespace GPU Share", "panel_id": 12, "panel_type": "piechart", "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" ] }, { "dashboard": "Atlas Overview", "panel_title": "Namespace RAM Share", "panel_id": 13, "panel_type": "piechart", "description": "Shares are normalized within the selected filter. 
Switching scope changes the denominator.", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)" ] }, { "dashboard": "Atlas Overview", "panel_title": "Worker Node CPU", "panel_id": 14, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" ] }, { "dashboard": "Atlas Overview", "panel_title": "Worker Node RAM", "panel_id": 15, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", 
\"nodename\", \"(.*)\")" ] }, { "dashboard": "Atlas Overview", "panel_title": "Control plane CPU", "panel_id": 16, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" ] }, { "dashboard": "Atlas Overview", "panel_title": "Control plane RAM", "panel_id": 17, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" ] }, { "dashboard": "Atlas Overview", "panel_title": "Node Pod Share", "panel_id": 28, "panel_type": "piechart", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" ] }, { "dashboard": "Atlas Overview", "panel_title": "Top Nodes by Pod Count", "panel_id": 29, "panel_type": "bargauge", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" ] }, 
{ "dashboard": "Atlas Overview", "panel_title": "Cluster Ingress Throughput", "panel_id": 18, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" ] }, { "dashboard": "Atlas Overview", "panel_title": "Cluster Egress Throughput", "panel_id": 19, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" ] }, { "dashboard": "Atlas Overview", "panel_title": "Intra-Cluster Throughput", "panel_id": 20, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" ] }, { "dashboard": "Atlas Overview", "panel_title": "Root Filesystem Usage", "panel_id": 21, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" ] }, { "dashboard": "Atlas Overview", "panel_title": "Nodes Closest to Full Root Disks", "panel_id": 22, "panel_type": "bargauge", "description": "", "tags": [ "atlas", "overview" ], "datasource_uid": "atlas-vm", 
"datasource_type": "prometheus", "exprs": [ "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))" ] }, { "dashboard": "Atlas Pods", "panel_title": "Problem Pods", "panel_id": 1, "panel_type": "stat", "description": "", "tags": [ "atlas", "pods" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" ] }, { "dashboard": "Atlas Pods", "panel_title": "CrashLoop / ImagePull", "panel_id": 2, "panel_type": "stat", "description": "", "tags": [ "atlas", "pods" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" ] }, { "dashboard": "Atlas Pods", "panel_title": "Stuck Terminating (>10m)", "panel_id": 3, "panel_type": "stat", "description": "", "tags": [ "atlas", "pods" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" ] }, { "dashboard": "Atlas Pods", "panel_title": "Control Plane Workloads", "panel_id": 4, "panel_type": "stat", "description": "", "tags": [ "atlas", "pods" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" ] }, { "dashboard": "Atlas Pods", "panel_title": "Pods Not Running", "panel_id": 5, "panel_type": 
"table", "description": "", "tags": [ "atlas", "pods" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})" ] }, { "dashboard": "Atlas Pods", "panel_title": "CrashLoop / ImagePull", "panel_id": 6, "panel_type": "table", "description": "", "tags": [ "atlas", "pods" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" ] }, { "dashboard": "Atlas Pods", "panel_title": "Terminating >10m", "panel_id": 7, "panel_type": "table", "description": "", "tags": [ "atlas", "pods" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)" ] }, { "dashboard": "Atlas Pods", "panel_title": "Node Pod Share", "panel_id": 8, "panel_type": "piechart", "description": "", "tags": [ "atlas", "pods" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" ] }, { "dashboard": "Atlas Pods", "panel_title": "Top Nodes by Pod Count", "panel_id": 9, "panel_type": "bargauge", "description": "", "tags": [ "atlas", "pods" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" ] }, { "dashboard": "Atlas Pods", "panel_title": "Namespace Plurality by 
Node v27", "panel_id": 10, "panel_type": "table", "description": "", "tags": [ "atlas", "pods" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 
0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) 
(kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))" ] }, { "dashboard": "Atlas Storage", "panel_title": "Astreae Usage", "panel_id": 1, "panel_type": "stat", "description": "", "tags": [ "atlas", "storage" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" ] }, { "dashboard": "Atlas Storage", "panel_title": "Asteria Usage", "panel_id": 2, "panel_type": "stat", "description": "", "tags": [ "atlas", "storage" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" ] }, { "dashboard": "Atlas Storage", "panel_title": "Astreae Free", "panel_id": 3, "panel_type": "stat", "description": "", "tags": [ "atlas", "storage" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" ] }, { "dashboard": "Atlas Storage", "panel_title": "Asteria Free", "panel_id": 4, "panel_type": "stat", "description": "", "tags": [ "atlas", "storage" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" ] }, { "dashboard": "Atlas Storage", "panel_title": "Astreae Per-Node Usage", "panel_id": 5, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "storage" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(avg by 
(node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" ] }, { "dashboard": "Atlas Storage", "panel_title": "Asteria Per-Node Usage", "panel_id": 6, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "storage" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" ] }, { "dashboard": "Atlas Storage", "panel_title": "Astreae Usage History", "panel_id": 7, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "storage" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" ] }, { "dashboard": "Atlas Storage", "panel_title": "Asteria Usage History", "panel_id": 8, "panel_type": "timeseries", "description": "", "tags": [ "atlas", "storage" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / 
sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" ] }, { "dashboard": "Atlas Storage", "panel_title": "Maintenance Sweepers Ready", "panel_id": 30, "panel_type": "stat", "description": "", "tags": [ "atlas", "storage" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100" ] }, { "dashboard": "Atlas Storage", "panel_title": "Maintenance Cron Freshness (s)", "panel_id": 31, "panel_type": "stat", "description": "", "tags": [ "atlas", "storage" ], "datasource_uid": "atlas-vm", "datasource_type": "prometheus", "exprs": [ "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})" ] } ]