titan-iac/knowledge/catalog/metrics.json

1881 lines
59 KiB
JSON

[
{
"dashboard": "Atlas GPU",
"panel_title": "Namespace GPU Share",
"panel_id": 1,
"panel_type": "piechart",
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
"tags": [
"atlas",
"gpu"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))"
]
},
{
"dashboard": "Atlas GPU",
"panel_title": "GPU Util by Namespace",
"panel_id": 2,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"gpu"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)"
]
},
{
"dashboard": "Atlas GPU",
"panel_title": "GPU Util by Node",
"panel_id": 3,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"gpu"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})"
]
},
{
"dashboard": "Atlas GPU",
"panel_title": "Top Pods by GPU Util",
"panel_id": 4,
"panel_type": "table",
"description": "",
"tags": [
"atlas",
"gpu"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Ariadne Task Errors (range)",
"panel_id": 1,
"panel_type": "bargauge",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Ariadne Attempts / Failures",
"panel_id": 2,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(increase(ariadne_task_runs_total[$__interval]))",
"sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "One-off Job Pods (age hours)",
"panel_id": 3,
"panel_type": "bargauge",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Glue Jobs Stale (>36h)",
"panel_id": 4,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Glue Jobs Missing Success",
"panel_id": 5,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Glue Jobs Suspended",
"panel_id": 6,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Ariadne Task Errors (1h)",
"panel_id": 7,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Ariadne Task Errors (24h)",
"panel_id": 8,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Ariadne Task Runs (1h)",
"panel_id": 9,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(increase(ariadne_task_runs_total[1h]))"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Ariadne Schedule Last Error (hours ago)",
"panel_id": 10,
"panel_type": "bargauge",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Ariadne Schedule Last Success (hours ago)",
"panel_id": 11,
"panel_type": "bargauge",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Glue Jobs Last Success (hours ago)",
"panel_id": 12,
"panel_type": "bargauge",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Glue Jobs Last Schedule (hours ago)",
"panel_id": 13,
"panel_type": "bargauge",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Ariadne Task Errors (1h)",
"panel_id": 14,
"panel_type": "bargauge",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Ariadne Task Errors (30d)",
"panel_id": 15,
"panel_type": "bargauge",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Ariadne Access Requests",
"panel_id": 16,
"panel_type": "bargauge",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sort_desc(ariadne_access_requests_total)"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Ariadne CI Coverage (%)",
"panel_id": 17,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"ariadne_ci_coverage_percent{repo=\"ariadne\"}"
]
},
{
"dashboard": "Atlas Jobs",
"panel_title": "Ariadne CI Tests (latest)",
"panel_id": 18,
"panel_type": "table",
"description": "",
"tags": [
"atlas",
"jobs",
"glue"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"ariadne_ci_tests_total{repo=\"ariadne\"}"
]
},
{
"dashboard": "Atlas Mail",
"panel_title": "Sent (1d)",
"panel_id": 1,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"mail"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"max(postmark_outbound_sent{window=\"1d\"})"
]
},
{
"dashboard": "Atlas Mail",
"panel_title": "Sent (7d)",
"panel_id": 2,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"mail"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"max(postmark_outbound_sent{window=\"7d\"})"
]
},
{
"dashboard": "Atlas Mail",
"panel_title": "Mail Bounces (1d)",
"panel_id": 3,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"mail"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"max(postmark_outbound_bounce_rate{window=\"1d\"})",
"max(postmark_outbound_bounced{window=\"1d\"})"
]
},
{
"dashboard": "Atlas Mail",
"panel_title": "Success Rate (1d)",
"panel_id": 4,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"mail"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)"
]
},
{
"dashboard": "Atlas Mail",
"panel_title": "Limit Used (30d)",
"panel_id": 5,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"mail"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"max(postmark_sending_limit_used_percent)"
]
},
{
"dashboard": "Atlas Mail",
"panel_title": "Send Limit (30d)",
"panel_id": 6,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"mail"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"max(postmark_sending_limit)"
]
},
{
"dashboard": "Atlas Mail",
"panel_title": "Last Success",
"panel_id": 7,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"mail"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"max(postmark_last_success_timestamp_seconds)"
]
},
{
"dashboard": "Atlas Mail",
"panel_title": "Exporter Errors",
"panel_id": 8,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"mail"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(postmark_request_errors_total)"
]
},
{
"dashboard": "Atlas Mail",
"panel_title": "Bounce Rate (1d vs 7d)",
"panel_id": 13,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"mail"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"max by (window) (postmark_outbound_bounce_rate)"
]
},
{
"dashboard": "Atlas Mail",
"panel_title": "Bounced (1d vs 7d)",
"panel_id": 14,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"mail"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"max by (window) (postmark_outbound_bounced)"
]
},
{
"dashboard": "Atlas Mail",
"panel_title": "Sent (1d vs 7d)",
"panel_id": 15,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"mail"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"max by (window) (postmark_outbound_sent)"
]
},
{
"dashboard": "Atlas Mail",
"panel_title": "Exporter Errors",
"panel_id": 16,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"mail"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(postmark_request_errors_total)"
]
},
{
"dashboard": "Atlas Network",
"panel_title": "Ingress Success Rate (5m)",
"panel_id": 1,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"network"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)"
]
},
{
"dashboard": "Atlas Network",
"panel_title": "Error Budget Burn (1h)",
"panel_id": 2,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"network"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009"
]
},
{
"dashboard": "Atlas Network",
"panel_title": "Error Budget Burn (6h)",
"panel_id": 3,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"network"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009"
]
},
{
"dashboard": "Atlas Network",
"panel_title": "Edge P99 Latency (ms)",
"panel_id": 4,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"network"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
]
},
{
"dashboard": "Atlas Network",
"panel_title": "Ingress Traffic",
"panel_id": 5,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"network"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
]
},
{
"dashboard": "Atlas Network",
"panel_title": "Egress Traffic",
"panel_id": 6,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"network"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
]
},
{
"dashboard": "Atlas Network",
"panel_title": "Intra-Cluster Traffic",
"panel_id": 7,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"network"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)"
]
},
{
"dashboard": "Atlas Network",
"panel_title": "Per-Node Throughput",
"panel_id": 8,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"network"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
]
},
{
"dashboard": "Atlas Network",
"panel_title": "Top Namespaces",
"panel_id": 9,
"panel_type": "table",
"description": "",
"tags": [
"atlas",
"network"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))"
]
},
{
"dashboard": "Atlas Network",
"panel_title": "Top Pods",
"panel_id": 10,
"panel_type": "table",
"description": "",
"tags": [
"atlas",
"network"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))"
]
},
{
"dashboard": "Atlas Network",
"panel_title": "Traefik Routers (req/s)",
"panel_id": 11,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"network"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))"
]
},
{
"dashboard": "Atlas Network",
"panel_title": "Traefik Entrypoints (req/s)",
"panel_id": 12,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"network"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))"
]
},
{
"dashboard": "Atlas Nodes",
"panel_title": "Worker Nodes Ready",
"panel_id": 1,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"nodes"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})"
]
},
{
"dashboard": "Atlas Nodes",
"panel_title": "Control Plane Ready",
"panel_id": 2,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"nodes"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})"
]
},
{
"dashboard": "Atlas Nodes",
"panel_title": "Control Plane Workloads",
"panel_id": 3,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"nodes"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})"
]
},
{
"dashboard": "Atlas Nodes",
"panel_title": "API Server 5xx rate",
"panel_id": 9,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"nodes"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))"
]
},
{
"dashboard": "Atlas Nodes",
"panel_title": "API Server P99 latency",
"panel_id": 10,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"nodes"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000"
]
},
{
"dashboard": "Atlas Nodes",
"panel_title": "etcd P99 latency",
"panel_id": 11,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"nodes"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000"
]
},
{
"dashboard": "Atlas Nodes",
"panel_title": "Node CPU",
"panel_id": 4,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"nodes"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
]
},
{
"dashboard": "Atlas Nodes",
"panel_title": "Node RAM",
"panel_id": 5,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"nodes"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
]
},
{
"dashboard": "Atlas Nodes",
"panel_title": "Control Plane (incl. titan-db) CPU",
"panel_id": 6,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"nodes"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
]
},
{
"dashboard": "Atlas Nodes",
"panel_title": "Control Plane (incl. titan-db) RAM",
"panel_id": 7,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"nodes"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
]
},
{
"dashboard": "Atlas Nodes",
"panel_title": "Root Filesystem Usage",
"panel_id": 8,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"nodes"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Control Plane Ready",
"panel_id": 2,
"panel_type": "gauge",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Control Plane Workloads",
"panel_id": 3,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Stuck Terminating",
"panel_id": 5,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Atlas Availability",
"panel_id": 27,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Problem Pods",
"panel_id": 4,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "CrashLoop / ImagePull",
"panel_id": 6,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Workers Ready",
"panel_id": 1,
"panel_type": "gauge",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Hottest node: CPU",
"panel_id": 7,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Hottest node: RAM",
"panel_id": 8,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Hottest node: NET (rx+tx)",
"panel_id": 9,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Hottest node: I/O (r+w)",
"panel_id": 10,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Mail Sent (1d)",
"panel_id": 30,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"max(postmark_outbound_sent{window=\"1d\"})"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Mail Bounces (1d)",
"panel_id": 31,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"max(postmark_outbound_bounce_rate{window=\"1d\"})",
"max(postmark_outbound_bounced{window=\"1d\"})"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Mail Success Rate (1d)",
"panel_id": 32,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Mail Limit Used (30d)",
"panel_id": 33,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"max(postmark_sending_limit_used_percent)"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Postgres Connections Used",
"panel_id": 34,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Postgres Hottest Connections",
"panel_id": 35,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"topk(1, sum by (datname) (pg_stat_activity_count))"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Astreae Usage",
"panel_id": 23,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Asteria Usage",
"panel_id": 24,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Astreae Free",
"panel_id": 25,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Asteria Free",
"panel_id": 26,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "One-off Job Pods (age hours)",
"panel_id": 40,
"panel_type": "bargauge",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Ariadne Attempts / Failures",
"panel_id": 41,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(increase(ariadne_task_runs_total[$__interval]))",
"sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Ariadne Test Success Rate",
"panel_id": 42,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Tests with Failures (24h)",
"panel_id": 43,
"panel_type": "bargauge",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Namespace CPU Share",
"panel_id": 11,
"panel_type": "piechart",
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Namespace GPU Share",
"panel_id": 12,
"panel_type": "piechart",
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Namespace RAM Share",
"panel_id": 13,
"panel_type": "piechart",
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Worker Node CPU",
"panel_id": 14,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Worker Node RAM",
"panel_id": 15,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Control plane CPU",
"panel_id": 16,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Control plane RAM",
"panel_id": 17,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Node Pod Share",
"panel_id": 28,
"panel_type": "piechart",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Top Nodes by Pod Count",
"panel_id": 29,
"panel_type": "bargauge",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Cluster Ingress Throughput",
"panel_id": 18,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Cluster Egress Throughput",
"panel_id": 19,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Intra-Cluster Throughput",
"panel_id": 20,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Root Filesystem Usage",
"panel_id": 21,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
]
},
{
"dashboard": "Atlas Overview",
"panel_title": "Nodes Closest to Full Root Disks",
"panel_id": 22,
"panel_type": "bargauge",
"description": "",
"tags": [
"atlas",
"overview"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))"
]
},
{
"dashboard": "Atlas Pods",
"panel_title": "Problem Pods",
"panel_id": 1,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"pods"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)"
]
},
{
"dashboard": "Atlas Pods",
"panel_title": "CrashLoop / ImagePull",
"panel_id": 2,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"pods"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)"
]
},
{
"dashboard": "Atlas Pods",
"panel_title": "Stuck Terminating (>10m)",
"panel_id": 3,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"pods"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)"
]
},
{
"dashboard": "Atlas Pods",
"panel_title": "Control Plane Workloads",
"panel_id": 4,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"pods"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})"
]
},
{
"dashboard": "Atlas Pods",
"panel_title": "Pods Not Running",
"panel_id": 5,
"panel_type": "table",
"description": "",
"tags": [
"atlas",
"pods"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})"
]
},
{
"dashboard": "Atlas Pods",
"panel_title": "CrashLoop / ImagePull",
"panel_id": 6,
"panel_type": "table",
"description": "",
"tags": [
"atlas",
"pods"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})"
]
},
{
"dashboard": "Atlas Pods",
"panel_title": "Terminating >10m",
"panel_id": 7,
"panel_type": "table",
"description": "",
"tags": [
"atlas",
"pods"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)"
]
},
{
"dashboard": "Atlas Pods",
"panel_title": "Node Pod Share",
"panel_id": 8,
"panel_type": "piechart",
"description": "",
"tags": [
"atlas",
"pods"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100"
]
},
{
"dashboard": "Atlas Pods",
"panel_title": "Top Nodes by Pod Count",
"panel_id": 9,
"panel_type": "bargauge",
"description": "",
"tags": [
"atlas",
"pods"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))"
]
},
{
"dashboard": "Atlas Pods",
"panel_title": "Namespace Plurality by Node v27",
"panel_id": 10,
"panel_type": "table",
"description": "",
"tags": [
"atlas",
"pods"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))"
]
},
{
"dashboard": "Atlas Storage",
"panel_title": "Astreae Usage",
"panel_id": 1,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"storage"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)"
]
},
{
"dashboard": "Atlas Storage",
"panel_title": "Asteria Usage",
"panel_id": 2,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"storage"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)"
]
},
{
"dashboard": "Atlas Storage",
"panel_title": "Astreae Free",
"panel_id": 3,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"storage"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})"
]
},
{
"dashboard": "Atlas Storage",
"panel_title": "Asteria Free",
"panel_id": 4,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"storage"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})"
]
},
{
"dashboard": "Atlas Storage",
"panel_title": "Astreae Per-Node Usage",
"panel_id": 5,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"storage"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
]
},
{
"dashboard": "Atlas Storage",
"panel_title": "Asteria Per-Node Usage",
"panel_id": 6,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"storage"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
]
},
{
"dashboard": "Atlas Storage",
"panel_title": "Astreae Usage History",
"panel_id": 7,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"storage"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)"
]
},
{
"dashboard": "Atlas Storage",
"panel_title": "Asteria Usage History",
"panel_id": 8,
"panel_type": "timeseries",
"description": "",
"tags": [
"atlas",
"storage"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)"
]
},
{
"dashboard": "Atlas Storage",
"panel_title": "Maintenance Sweepers Ready",
"panel_id": 30,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"storage"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100"
]
},
{
"dashboard": "Atlas Storage",
"panel_title": "Maintenance Cron Freshness (s)",
"panel_id": 31,
"panel_type": "stat",
"description": "",
"tags": [
"atlas",
"storage"
],
"datasource_uid": "atlas-vm",
"datasource_type": "prometheus",
"exprs": [
"time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})"
]
}
]