1881 lines
59 KiB
JSON
1881 lines
59 KiB
JSON
[
|
|
{
|
|
"dashboard": "Atlas GPU",
|
|
"panel_title": "Namespace GPU Share",
|
|
"panel_id": 1,
|
|
"panel_type": "piechart",
|
|
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
|
|
"tags": [
|
|
"atlas",
|
|
"gpu"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas GPU",
|
|
"panel_title": "GPU Util by Namespace",
|
|
"panel_id": 2,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"gpu"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas GPU",
|
|
"panel_title": "GPU Util by Node",
|
|
"panel_id": 3,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"gpu"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas GPU",
|
|
"panel_title": "Top Pods by GPU Util",
|
|
"panel_id": 4,
|
|
"panel_type": "table",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"gpu"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Ariadne Task Errors (range)",
|
|
"panel_id": 1,
|
|
"panel_type": "bargauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Ariadne Attempts / Failures",
|
|
"panel_id": 2,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(increase(ariadne_task_runs_total[$__interval]))",
|
|
"sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "One-off Job Pods (age hours)",
|
|
"panel_id": 3,
|
|
"panel_type": "bargauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Glue Jobs Stale (>36h)",
|
|
"panel_id": 4,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)) + (count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Glue Jobs Missing Success",
|
|
"panel_id": 5,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Glue Jobs Suspended",
|
|
"panel_id": 6,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Ariadne Task Errors (1h)",
|
|
"panel_id": 7,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Ariadne Task Errors (24h)",
|
|
"panel_id": 8,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Ariadne Task Runs (1h)",
|
|
"panel_id": 9,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(increase(ariadne_task_runs_total[1h]))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Ariadne Schedule Last Error (hours ago)",
|
|
"panel_id": 10,
|
|
"panel_type": "bargauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Ariadne Schedule Last Success (hours ago)",
|
|
"panel_id": 11,
|
|
"panel_type": "bargauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Glue Jobs Last Success (hours ago)",
|
|
"panel_id": 12,
|
|
"panel_type": "bargauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Glue Jobs Last Schedule (hours ago)",
|
|
"panel_id": 13,
|
|
"panel_type": "bargauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Ariadne Task Errors (1h)",
|
|
"panel_id": 14,
|
|
"panel_type": "bargauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Ariadne Task Errors (30d)",
|
|
"panel_id": 15,
|
|
"panel_type": "bargauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Ariadne Access Requests",
|
|
"panel_id": 16,
|
|
"panel_type": "bargauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sort_desc(ariadne_access_requests_total)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Ariadne CI Coverage (%)",
|
|
"panel_id": 17,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"ariadne_ci_coverage_percent{repo=\"ariadne\"}"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Jobs",
|
|
"panel_title": "Ariadne CI Tests (latest)",
|
|
"panel_id": 18,
|
|
"panel_type": "table",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"jobs",
|
|
"glue"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"ariadne_ci_tests_total{repo=\"ariadne\"}"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Mail",
|
|
"panel_title": "Sent (1d)",
|
|
"panel_id": 1,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"mail"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"max(postmark_outbound_sent{window=\"1d\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Mail",
|
|
"panel_title": "Sent (7d)",
|
|
"panel_id": 2,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"mail"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"max(postmark_outbound_sent{window=\"7d\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Mail",
|
|
"panel_title": "Mail Bounces (1d)",
|
|
"panel_id": 3,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"mail"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"max(postmark_outbound_bounce_rate{window=\"1d\"})",
|
|
"max(postmark_outbound_bounced{window=\"1d\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Mail",
|
|
"panel_title": "Success Rate (1d)",
|
|
"panel_id": 4,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"mail"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Mail",
|
|
"panel_title": "Limit Used (30d)",
|
|
"panel_id": 5,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"mail"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"max(postmark_sending_limit_used_percent)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Mail",
|
|
"panel_title": "Send Limit (30d)",
|
|
"panel_id": 6,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"mail"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"max(postmark_sending_limit)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Mail",
|
|
"panel_title": "Last Success",
|
|
"panel_id": 7,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"mail"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"max(postmark_last_success_timestamp_seconds)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Mail",
|
|
"panel_title": "Exporter Errors",
|
|
"panel_id": 8,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"mail"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(postmark_request_errors_total)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Mail",
|
|
"panel_title": "Bounce Rate (1d vs 7d)",
|
|
"panel_id": 13,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"mail"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"max by (window) (postmark_outbound_bounce_rate)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Mail",
|
|
"panel_title": "Bounced (1d vs 7d)",
|
|
"panel_id": 14,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"mail"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"max by (window) (postmark_outbound_bounced)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Mail",
|
|
"panel_title": "Sent (1d vs 7d)",
|
|
"panel_id": 15,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"mail"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"max by (window) (postmark_outbound_sent)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Mail",
|
|
"panel_title": "Exporter Errors",
|
|
"panel_id": 16,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"mail"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(postmark_request_errors_total)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Network",
|
|
"panel_title": "Ingress Success Rate (5m)",
|
|
"panel_id": 1,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"network"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m])) or on() vector(0)) / (sum(rate(traefik_entrypoint_requests_total[5m])) > 0)) or on() vector(1)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Network",
|
|
"panel_title": "Error Budget Burn (1h)",
|
|
"panel_id": 2,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"network"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(1 - (((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h])) or on() vector(0)) / (sum(rate(traefik_entrypoint_requests_total[1h])) > 0)) or on() vector(1))) / 0.001"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Network",
|
|
"panel_title": "Error Budget Burn (6h)",
|
|
"panel_id": 3,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"network"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(1 - (((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h])) or on() vector(0)) / (sum(rate(traefik_entrypoint_requests_total[6h])) > 0)) or on() vector(1))) / 0.001"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Network",
|
|
"panel_title": "Edge P99 Latency (ms)",
|
|
"panel_id": 4,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"network"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Network",
|
|
"panel_title": "Ingress Traffic",
|
|
"panel_id": 5,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"network"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Network",
|
|
"panel_title": "Egress Traffic",
|
|
"panel_id": 6,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"network"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Network",
|
|
"panel_title": "Intra-Cluster Traffic",
|
|
"panel_id": 7,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"network"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Network",
|
|
"panel_title": "Per-Node Throughput",
|
|
"panel_id": 8,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"network"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"avg by (node) ((sum by (instance) (rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m]) + rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Network",
|
|
"panel_title": "Top Namespaces",
|
|
"panel_id": 9,
|
|
"panel_type": "table",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"network"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Network",
|
|
"panel_title": "Top Pods",
|
|
"panel_id": 10,
|
|
"panel_type": "table",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"network"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Network",
|
|
"panel_title": "Traefik Routers (req/s)",
|
|
"panel_id": 11,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"network"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Network",
|
|
"panel_title": "Traefik Entrypoints (req/s)",
|
|
"panel_id": 12,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"network"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Nodes",
|
|
"panel_title": "Worker Nodes Ready",
|
|
"panel_id": 1,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"nodes"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Nodes",
|
|
"panel_title": "Control Plane Ready",
|
|
"panel_id": 2,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"nodes"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Nodes",
|
|
"panel_title": "Control Plane Workloads",
|
|
"panel_id": 3,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"nodes"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Nodes",
|
|
"panel_title": "API Server 5xx rate",
|
|
"panel_id": 9,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"nodes"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Nodes",
|
|
"panel_title": "API Server P99 latency",
|
|
"panel_id": 10,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"nodes"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Nodes",
|
|
"panel_title": "etcd P99 latency",
|
|
"panel_id": 11,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"nodes"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Nodes",
|
|
"panel_title": "Node CPU",
|
|
"panel_id": 4,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"nodes"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Nodes",
|
|
"panel_title": "Node RAM",
|
|
"panel_id": 5,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"nodes"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Nodes",
|
|
"panel_title": "Control Plane (incl. titan-db) CPU",
|
|
"panel_id": 6,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"nodes"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Nodes",
|
|
"panel_title": "Control Plane (incl. titan-db) RAM",
|
|
"panel_id": 7,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"nodes"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Nodes",
|
|
"panel_title": "Root Filesystem Usage",
|
|
"panel_id": 8,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"nodes"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Control Plane Ready",
|
|
"panel_id": 2,
|
|
"panel_type": "gauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Control Plane Workloads",
|
|
"panel_id": 3,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Stuck Terminating",
|
|
"panel_id": 5,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Atlas Availability",
|
|
"panel_id": 27,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Problem Pods",
|
|
"panel_id": 4,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "CrashLoop / ImagePull",
|
|
"panel_id": 6,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Workers Ready",
|
|
"panel_id": 1,
|
|
"panel_type": "gauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Hottest node: CPU",
|
|
"panel_id": 7,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Hottest node: RAM",
|
|
"panel_id": 8,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Hottest node: NET (rx+tx)",
|
|
"panel_id": 9,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Hottest node: I/O (r+w)",
|
|
"panel_id": 10,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Mail Sent (1d)",
|
|
"panel_id": 30,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"max(postmark_outbound_sent{window=\"1d\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Mail Bounces (1d)",
|
|
"panel_id": 31,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"max(postmark_outbound_bounce_rate{window=\"1d\"})",
|
|
"max(postmark_outbound_bounced{window=\"1d\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Mail Success Rate (1d)",
|
|
"panel_id": 32,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Mail Limit Used (30d)",
|
|
"panel_id": 33,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"max(postmark_sending_limit_used_percent)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Postgres Connections Used",
|
|
"panel_id": 34,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Postgres Hottest Connections",
|
|
"panel_id": 35,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"topk(1, sum by (datname) (pg_stat_activity_count))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Astreae Usage",
|
|
"panel_id": 23,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Asteria Usage",
|
|
"panel_id": 24,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Astreae Free",
|
|
"panel_id": 25,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Asteria Free",
|
|
"panel_id": 26,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "One-off Job Pods (age hours)",
|
|
"panel_id": 40,
|
|
"panel_type": "bargauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Ariadne Attempts / Failures",
|
|
"panel_id": 41,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(increase(ariadne_task_runs_total[$__interval]))",
|
|
"sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Ariadne Test Success Rate",
|
|
"panel_id": 42,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Tests with Failures (24h)",
|
|
"panel_id": 43,
|
|
"panel_type": "bargauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Namespace CPU Share",
|
|
"panel_id": 11,
|
|
"panel_type": "piechart",
|
|
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Namespace GPU Share",
|
|
"panel_id": 12,
|
|
"panel_type": "piechart",
|
|
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Namespace RAM Share",
|
|
"panel_id": 13,
|
|
"panel_type": "piechart",
|
|
"description": "Shares are normalized within the selected filter. Switching scope changes the denominator.",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Worker Node CPU",
|
|
"panel_id": 14,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Worker Node RAM",
|
|
"panel_id": 15,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Control plane CPU",
|
|
"panel_id": 16,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Control plane RAM",
|
|
"panel_id": 17,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Node Pod Share",
|
|
"panel_id": 28,
|
|
"panel_type": "piechart",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Top Nodes by Pod Count",
|
|
"panel_id": 29,
|
|
"panel_type": "bargauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Cluster Ingress Throughput",
|
|
"panel_id": 18,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Cluster Egress Throughput",
|
|
"panel_id": 19,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Intra-Cluster Throughput",
|
|
"panel_id": 20,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Root Filesystem Usage",
|
|
"panel_id": 21,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Overview",
|
|
"panel_title": "Nodes Closest to Full Root Disks",
|
|
"panel_id": 22,
|
|
"panel_type": "bargauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"overview"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Pods",
|
|
"panel_title": "Problem Pods",
|
|
"panel_id": 1,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"pods"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Pods",
|
|
"panel_title": "CrashLoop / ImagePull",
|
|
"panel_id": 2,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"pods"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Pods",
|
|
"panel_title": "Stuck Terminating (>10m)",
|
|
"panel_id": 3,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"pods"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Pods",
|
|
"panel_title": "Control Plane Workloads",
|
|
"panel_id": 4,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"pods"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Pods",
|
|
"panel_title": "Pods Not Running",
|
|
"panel_id": 5,
|
|
"panel_type": "table",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"pods"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Pods",
|
|
"panel_title": "CrashLoop / ImagePull",
|
|
"panel_id": 6,
|
|
"panel_type": "table",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"pods"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Pods",
|
|
"panel_title": "Terminating >10m",
|
|
"panel_id": 7,
|
|
"panel_type": "table",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"pods"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Pods",
|
|
"panel_title": "Node Pod Share",
|
|
"panel_id": 8,
|
|
"panel_type": "piechart",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"pods"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Pods",
|
|
"panel_title": "Top Nodes by Pod Count",
|
|
"panel_id": 9,
|
|
"panel_type": "bargauge",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"pods"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Pods",
|
|
"panel_title": "Namespace Plurality by Node v27",
|
|
"panel_id": 10,
|
|
"panel_type": "table",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"pods"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) 
(kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 
0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Storage",
|
|
"panel_title": "Astreae Usage",
|
|
"panel_id": 1,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"storage"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Storage",
|
|
"panel_title": "Asteria Usage",
|
|
"panel_id": 2,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"storage"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Storage",
|
|
"panel_title": "Astreae Free",
|
|
"panel_id": 3,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"storage"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Storage",
|
|
"panel_title": "Asteria Free",
|
|
"panel_id": 4,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"storage"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Storage",
|
|
"panel_title": "Astreae Per-Node Usage",
|
|
"panel_id": 5,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"storage"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Storage",
|
|
"panel_title": "Asteria Per-Node Usage",
|
|
"panel_id": 6,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"storage"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Storage",
|
|
"panel_title": "Astreae Usage History",
|
|
"panel_id": 7,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"storage"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Storage",
|
|
"panel_title": "Asteria Usage History",
|
|
"panel_id": 8,
|
|
"panel_type": "timeseries",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"storage"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Storage",
|
|
"panel_title": "Maintenance Sweepers Ready",
|
|
"panel_id": 30,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"storage"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100"
|
|
]
|
|
},
|
|
{
|
|
"dashboard": "Atlas Storage",
|
|
"panel_title": "Maintenance Cron Freshness (s)",
|
|
"panel_id": 31,
|
|
"panel_type": "stat",
|
|
"description": "",
|
|
"tags": [
|
|
"atlas",
|
|
"storage"
|
|
],
|
|
"datasource_uid": "atlas-vm",
|
|
"datasource_type": "prometheus",
|
|
"exprs": [
|
|
"time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})"
|
|
]
|
|
}
|
|
]
|