From 33b5e2b678191b9c357f9ea64c8a03c02f9eae23 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 26 Jan 2026 14:08:11 -0300 Subject: [PATCH] atlasbot: add metrics kb and long timeout --- knowledge/catalog/metrics.json | 1880 +++++++++++++++++ scripts/knowledge_render_atlas.py | 65 + .../bstein-dev-home/backend-deployment.yaml | 4 +- .../chat-ai-gateway-deployment.yaml | 2 + services/bstein-dev-home/scripts/gateway.py | 3 +- services/comms/atlasbot-deployment.yaml | 8 +- services/comms/knowledge/catalog/metrics.json | 1880 +++++++++++++++++ services/comms/kustomization.yaml | 1 + services/comms/scripts/atlasbot/bot.py | 97 +- 9 files changed, 3934 insertions(+), 6 deletions(-) create mode 100644 knowledge/catalog/metrics.json create mode 100644 services/comms/knowledge/catalog/metrics.json diff --git a/knowledge/catalog/metrics.json b/knowledge/catalog/metrics.json new file mode 100644 index 0000000..e929db5 --- /dev/null +++ b/knowledge/catalog/metrics.json @@ -0,0 +1,1880 @@ +[ + { + "dashboard": "Atlas GPU", + "panel_title": "Namespace GPU Share", + "panel_id": 1, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Namespace", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Node", + "panel_id": 3, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "Top Pods by GPU Util", + "panel_id": 4, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (range)", + "panel_id": 1, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + 
"sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 3, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Stale (>36h)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Missing Success", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Suspended", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (24h)", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Runs (1h)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": 
"atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Error (hours ago)", + "panel_id": 10, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Success (hours ago)", + "panel_id": 11, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Success (hours ago)", + "panel_id": 12, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Schedule (hours ago)", + "panel_id": 13, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 14, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (30d)", + "panel_id": 15, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Access Requests", + "panel_id": 16, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(ariadne_access_requests_total)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Coverage (%)", + "panel_id": 17, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_coverage_percent{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Tests (latest)", + "panel_id": 18, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + 
"ariadne_ci_tests_total{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (7d)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"7d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Mail Bounces (1d)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Success Rate (1d)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Limit Used (30d)", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Send Limit (30d)", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Last Success", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_last_success_timestamp_seconds)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounce Rate (1d vs 7d)", + "panel_id": 13, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounce_rate)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounced (1d vs 7d)", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounced)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d vs 7d)", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_sent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + 
"panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Success Rate (5m)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (1h)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (6h)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Edge P99 Latency (ms)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Traffic", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Egress Traffic", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Intra-Cluster Traffic", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Per-Node Throughput", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) 
((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Namespaces", + "panel_id": 9, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Pods", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Routers (req/s)", + "panel_id": 11, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Entrypoints (req/s)", + "panel_id": 12, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Worker Nodes Ready", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server 5xx rate", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + 
"sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server P99 latency", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "etcd P99 latency", + "panel_id": 11, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node CPU", + "panel_id": 4, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node RAM", + "panel_id": 5, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) CPU", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. 
titan-db) RAM", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Root Filesystem Usage", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Stuck Terminating", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Atlas Availability", + "panel_id": 27, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Problem Pods", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + 
"panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Workers Ready", + "panel_id": 1, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: CPU", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: RAM", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: NET (rx+tx)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: I/O (r+w)", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Sent (1d)", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + 
"max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Bounces (1d)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Success Rate (1d)", + "panel_id": 32, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Limit Used (30d)", + "panel_id": 33, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Connections Used", + "panel_id": 34, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Hottest Connections", + "panel_id": 35, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(1, sum by (datname) (pg_stat_activity_count))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Usage", + "panel_id": 23, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Asteria Usage", + "panel_id": 24, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Free", + "panel_id": 25, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Asteria Free", + "panel_id": 26, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + 
"panel_title": "One-off Job Pods (age hours)", + "panel_id": 40, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 41, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Test Success Rate", + "panel_id": 42, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Tests with Failures (24h)", + "panel_id": 43, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace CPU Share", + "panel_id": 11, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace GPU Share", + "panel_id": 12, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. 
Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace RAM Share", + "panel_id": 13, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node CPU", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node RAM", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane CPU", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + 
{ + "dashboard": "Atlas Overview", + "panel_title": "Control plane RAM", + "panel_id": 17, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Node Pod Share", + "panel_id": 28, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 29, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Ingress Throughput", + "panel_id": 18, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Egress Throughput", + "panel_id": 19, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Intra-Cluster Throughput", + "panel_id": 20, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Root Filesystem Usage", + "panel_id": 21, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Nodes Closest to Full Root Disks", + "panel_id": 22, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + 
], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Problem Pods", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Stuck Terminating (>10m)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Control Plane Workloads", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Pods Not Running", + "panel_id": 5, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Terminating >10m", + "panel_id": 7, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)" + ] + }, + { + "dashboard": "Atlas 
Pods", + "panel_title": "Node Pod Share", + "panel_id": 8, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 9, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Namespace Plurality by Node v27", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) 
(kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Free", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Free", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Per-Node Usage", + "panel_id": 5, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / 
node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Per-Node Usage", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage History", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage History", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Sweepers Ready", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Cron Freshness (s)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})" + ] + } +] diff --git a/scripts/knowledge_render_atlas.py b/scripts/knowledge_render_atlas.py index 206dcd9..1e305cb 100644 --- a/scripts/knowledge_render_atlas.py +++ b/scripts/knowledge_render_atlas.py @@ -26,6 +26,7 @@ from typing import Any, Iterable import yaml REPO_ROOT = Path(__file__).resolve().parents[1] +DASHBOARD_DIR = REPO_ROOT / "services" / "monitoring" / "dashboards" CLUSTER_SCOPED_KINDS = { "Namespace", @@ -67,6 +68,64 @@ def _sync_tree(source: Path, dest: Path) -> None: shutil.copytree(source, dest) +def _iter_dashboard_panels(dashboard: dict[str, Any]) -> 
Iterable[dict[str, Any]]: + panels = dashboard.get("panels") if isinstance(dashboard.get("panels"), list) else [] + for panel in panels: + if not isinstance(panel, dict): + continue + if panel.get("type") == "row" and isinstance(panel.get("panels"), list): + yield from _iter_dashboard_panels({"panels": panel.get("panels")}) + continue + yield panel + + +def _extract_metrics_index(dashboard_dir: Path) -> list[dict[str, Any]]: + index: list[dict[str, Any]] = [] + for path in sorted(dashboard_dir.glob("*.json")): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + continue + if not isinstance(data, dict): + continue + dash_title = data.get("title") or path.stem + dash_tags = data.get("tags") or [] + for panel in _iter_dashboard_panels(data): + targets = panel.get("targets") + if not isinstance(targets, list): + continue + exprs: list[str] = [] + for target in targets: + if not isinstance(target, dict): + continue + expr = target.get("expr") + if isinstance(expr, str) and expr.strip(): + exprs.append(expr.strip()) + if not exprs: + continue + datasource = panel.get("datasource") or {} + if isinstance(datasource, dict): + ds_uid = datasource.get("uid") + ds_type = datasource.get("type") + else: + ds_uid = None + ds_type = None + index.append( + { + "dashboard": dash_title, + "panel_title": panel.get("title") or "", + "panel_id": panel.get("id"), + "panel_type": panel.get("type"), + "description": panel.get("description") or "", + "tags": dash_tags, + "datasource_uid": ds_uid, + "datasource_type": ds_type, + "exprs": exprs, + } + ) + return index + + def kustomize_build(path: Path) -> str: rel = path.relative_to(REPO_ROOT) try: @@ -516,6 +575,7 @@ def main() -> int: summary_path = out_dir / "catalog" / "atlas-summary.json" diagram_path = out_dir / "diagrams" / "atlas-http.mmd" runbooks_json_path = out_dir / "catalog" / "runbooks.json" + metrics_json_path = out_dir / "catalog" / "metrics.json" catalog_rel = catalog_path.relative_to(REPO_ROOT).as_posix() catalog_path.write_text( @@ -560,12 +620,17 @@ def main() -> int: } ) runbooks_json_path.write_text(json.dumps(runbooks, indent=2, sort_keys=False) + "\n", encoding="utf-8") + metrics_index = _extract_metrics_index(DASHBOARD_DIR) + metrics_json_path.write_text( + json.dumps(metrics_index, indent=2, sort_keys=False) + "\n", encoding="utf-8" + ) print(f"Wrote {catalog_path.relative_to(REPO_ROOT)}") print(f"Wrote {catalog_json_path.relative_to(REPO_ROOT)}") print(f"Wrote {summary_path.relative_to(REPO_ROOT)}") print(f"Wrote {diagram_path.relative_to(REPO_ROOT)}") print(f"Wrote {runbooks_json_path.relative_to(REPO_ROOT)}") + print(f"Wrote {metrics_json_path.relative_to(REPO_ROOT)}") if args.sync_comms: comms_dir = REPO_ROOT / "services" / "comms" / "knowledge" diff --git a/services/bstein-dev-home/backend-deployment.yaml b/services/bstein-dev-home/backend-deployment.yaml index 2170396..ecf478c 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -58,14 +58,14 @@ spec: args: - >- . 
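For context on how the generated catalog is meant to be consumed: each entry emitted by _extract_metrics_index above is a plain dict, so a downstream reader can load catalog/metrics.json and filter it without any Grafana dependency. A minimal sketch, assuming the repo-relative output path from this patch; the "gpu" tag filter is only an illustrative choice:

    import json
    from pathlib import Path

    # Load the generated index and keep only GPU-tagged panels (illustrative filter).
    entries = json.loads(Path("knowledge/catalog/metrics.json").read_text(encoding="utf-8"))
    gpu_panels = [e for e in entries if "gpu" in (e.get("tags") or [])]
    for entry in gpu_panels:
        # Each entry carries dashboard/panel titles plus the raw PromQL expressions.
        print(entry["dashboard"], "-", entry["panel_title"], "->", entry["exprs"][0])
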
/vault/secrets/portal-env.sh - && exec gunicorn -b 0.0.0.0:8080 --workers 2 --timeout 180 app:app + && exec gunicorn -b 0.0.0.0:8080 --workers 2 --timeout 600 app:app env: - name: AI_CHAT_API value: http://ollama.ai.svc.cluster.local:11434 - name: AI_CHAT_MODEL value: qwen2.5-coder:7b-instruct-q4_0 - name: AI_CHAT_TIMEOUT_SEC - value: "60" + value: "480" - name: AI_NODE_NAME valueFrom: fieldRef: diff --git a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml index 40d74fe..7209da6 100644 --- a/services/bstein-dev-home/chat-ai-gateway-deployment.yaml +++ b/services/bstein-dev-home/chat-ai-gateway-deployment.yaml @@ -47,6 +47,8 @@ spec: env: - name: UPSTREAM_URL value: http://bstein-dev-home-backend/api/chat + - name: UPSTREAM_TIMEOUT_SEC + value: "600" ports: - name: http containerPort: 8080 diff --git a/services/bstein-dev-home/scripts/gateway.py b/services/bstein-dev-home/scripts/gateway.py index 3ca2fa1..19d3606 100644 --- a/services/bstein-dev-home/scripts/gateway.py +++ b/services/bstein-dev-home/scripts/gateway.py @@ -6,6 +6,7 @@ from urllib import request, error UPSTREAM = os.environ.get("UPSTREAM_URL", "http://bstein-dev-home-backend/api/chat") KEY_MATRIX = os.environ.get("CHAT_KEY_MATRIX", "") KEY_HOMEPAGE = os.environ.get("CHAT_KEY_HOMEPAGE", "") +UPSTREAM_TIMEOUT_SEC = float(os.environ.get("UPSTREAM_TIMEOUT_SEC", "90")) ALLOWED = {k for k in (KEY_MATRIX, KEY_HOMEPAGE) if k} @@ -41,7 +42,7 @@ class Handler(BaseHTTPRequestHandler): headers={"Content-Type": "application/json"}, method="POST", ) - with request.urlopen(upstream_req, timeout=90) as resp: + with request.urlopen(upstream_req, timeout=UPSTREAM_TIMEOUT_SEC) as resp: data = resp.read() self.send_response(resp.status) for k, v in resp.headers.items(): diff --git a/services/comms/atlasbot-deployment.yaml b/services/comms/atlasbot-deployment.yaml index 2c08853..031abb8 100644 --- a/services/comms/atlasbot-deployment.yaml +++ b/services/comms/atlasbot-deployment.yaml @@ -16,7 +16,7 @@ spec: labels: app: atlasbot annotations: - checksum/atlasbot-configmap: manual-atlasbot-12 + checksum/atlasbot-configmap: manual-atlasbot-13 vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "comms" vault.hashicorp.com/agent-inject-secret-turn-secret: "kv/data/atlas/comms/turn-shared-secret" @@ -83,6 +83,10 @@ spec: value: http://chat-ai-gateway.bstein-dev-home.svc.cluster.local/ - name: OLLAMA_MODEL value: qwen2.5-coder:7b-instruct-q4_0 + - name: OLLAMA_TIMEOUT_SEC + value: "480" + - name: ATLASBOT_THINKING_INTERVAL_SEC + value: "120" resources: requests: cpu: 100m @@ -114,6 +118,8 @@ spec: path: catalog/atlas.json - key: atlas-summary.json path: catalog/atlas-summary.json + - key: metrics.json + path: catalog/metrics.json - key: runbooks.json path: catalog/runbooks.json - key: atlas-http.mmd diff --git a/services/comms/knowledge/catalog/metrics.json b/services/comms/knowledge/catalog/metrics.json new file mode 100644 index 0000000..e929db5 --- /dev/null +++ b/services/comms/knowledge/catalog/metrics.json @@ -0,0 +1,1880 @@ +[ + { + "dashboard": "Atlas GPU", + "panel_title": "Namespace GPU Share", + "panel_id": 1, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. 
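Because every catalog entry pairs a Prometheus-type datasource with raw PromQL, a consumer such as atlasbot can evaluate an expression directly against VictoriaMetrics through its Prometheus-compatible HTTP API. A rough sketch, assuming the in-cluster VM_URL default that bot.py already uses; the example expression is the catalog's "Exporter Errors" query:

    import json
    import urllib.parse
    import urllib.request

    VM_URL = "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428"

    def instant_query(expr: str) -> list[dict]:
        # /api/v1/query is the standard Prometheus instant-query endpoint,
        # which VictoriaMetrics also serves; returns the raw result vector.
        url = VM_URL + "/api/v1/query?" + urllib.parse.urlencode({"query": expr})
        with urllib.request.urlopen(url, timeout=30) as resp:
            payload = json.loads(resp.read().decode("utf-8"))
        return payload.get("data", {}).get("result", [])

    # Example: evaluate one expression taken from the metrics catalog.
    result = instant_query("sum(postmark_request_errors_total)")
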
Switching scope changes the denominator.", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Namespace", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "GPU Util by Node", + "panel_id": 3, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})" + ] + }, + { + "dashboard": "Atlas GPU", + "panel_title": "Top Pods by GPU Util", + "panel_id": 4, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "gpu" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (range)", + "panel_id": 1, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 2, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 3, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Stale (>36h)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) 
(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Missing Success", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Suspended", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (24h)", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Runs (1h)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[1h]))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Error (hours ago)", + "panel_id": 10, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Schedule Last Success (hours ago)", + "panel_id": 11, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Success (hours ago)", + "panel_id": 12, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - 
max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Glue Jobs Last Schedule (hours ago)", + "panel_id": 13, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (1h)", + "panel_id": 14, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Task Errors (30d)", + "panel_id": 15, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne Access Requests", + "panel_id": 16, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(ariadne_access_requests_total)" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Coverage (%)", + "panel_id": 17, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_coverage_percent{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Jobs", + "panel_title": "Ariadne CI Tests (latest)", + "panel_id": 18, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "jobs", + "glue" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "ariadne_ci_tests_total{repo=\"ariadne\"}" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (7d)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"7d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Mail Bounces (1d)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Success Rate (1d)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": 
"prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Limit Used (30d)", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Send Limit (30d)", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Last Success", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_last_success_timestamp_seconds)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounce Rate (1d vs 7d)", + "panel_id": 13, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounce_rate)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Bounced (1d vs 7d)", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_bounced)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Sent (1d vs 7d)", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max by (window) (postmark_outbound_sent)" + ] + }, + { + "dashboard": "Atlas Mail", + "panel_title": "Exporter Errors", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "mail" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(postmark_request_errors_total)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Success Rate (5m)", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (1h)", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Error Budget Burn (6h)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + 
"atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Edge P99 Latency (ms)", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Ingress Traffic", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Egress Traffic", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Intra-Cluster Traffic", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Per-Node Throughput", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Namespaces", + "panel_id": 9, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Top Pods", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Routers (req/s)", + 
"panel_id": 11, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))" + ] + }, + { + "dashboard": "Atlas Network", + "panel_title": "Traefik Entrypoints (req/s)", + "panel_id": 12, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "network" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Worker Nodes Ready", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server 5xx rate", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "API Server P99 latency", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "etcd P99 latency", + "panel_id": 11, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node CPU", + "panel_id": 4, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Node RAM", + "panel_id": 5, 
+ "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) CPU", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Control Plane (incl. titan-db) RAM", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Nodes", + "panel_title": "Root Filesystem Usage", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "nodes" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Ready", + "panel_id": 2, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control Plane Workloads", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"}) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Stuck Terminating", + "panel_id": 5, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - 
kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Atlas Availability", + "panel_id": 27, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Problem Pods", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Workers Ready", + "panel_id": 1, + "panel_type": "gauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: CPU", + "panel_id": 7, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: RAM", + "panel_id": 8, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: NET (rx+tx)", + "panel_id": 9, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) 
((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Hottest node: I/O (r+w)", + "panel_id": 10, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Sent (1d)", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_sent{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Bounces (1d)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_outbound_bounce_rate{window=\"1d\"})", + "max(postmark_outbound_bounced{window=\"1d\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Success Rate (1d)", + "panel_id": 32, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "clamp_min(100 - max(postmark_outbound_bounce_rate{window=\"1d\"}), 0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Mail Limit Used (30d)", + "panel_id": 33, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "max(postmark_sending_limit_used_percent)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Connections Used", + "panel_id": 34, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "label_replace(sum(pg_stat_activity_count), \"conn\", \"used\", \"__name__\", \".*\") or label_replace(max(pg_settings_max_connections), \"conn\", \"max\", \"__name__\", \".*\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Postgres Hottest Connections", + "panel_id": 35, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "topk(1, sum by (datname) (pg_stat_activity_count))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Usage", + "panel_id": 23, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": 
"Asteria Usage", + "panel_id": 24, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Astreae Free", + "panel_id": 25, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Asteria Free", + "panel_id": 26, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "One-off Job Pods (age hours)", + "panel_id": 40, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Attempts / Failures", + "panel_id": 41, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(increase(ariadne_task_runs_total[$__interval]))", + "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Ariadne Test Success Rate", + "panel_id": 42, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[1h])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[1h])), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Tests with Failures (24h)", + "panel_id": 43, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace CPU Share", + "panel_id": 11, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. 
Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_cpu}[1m])) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace GPU Share", + "panel_id": 12, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Namespace RAM Share", + "panel_id": 13, + "panel_type": "piechart", + "description": "Shares are normalized within the selected filter. Switching scope changes the denominator.", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",container!=\"POD\",$namespace_scope_ram}) by (namespace) ), 1)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node CPU", + "panel_id": 14, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Worker Node RAM", + "panel_id": 15, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", 
\"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane CPU", + "panel_id": 16, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Control plane RAM", + "panel_id": 17, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db|titan-jh\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Node Pod Share", + "panel_id": 28, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 29, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Ingress Throughput", + "panel_id": 18, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Cluster Egress Throughput", + "panel_id": 19, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Intra-Cluster Throughput", + "panel_id": 20, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Root Filesystem Usage", + "panel_id": 21, 
+ "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))" + ] + }, + { + "dashboard": "Atlas Overview", + "panel_title": "Nodes Closest to Full Root Disks", + "panel_id": 22, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "overview" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Problem Pods", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Stuck Terminating (>10m)", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Control Plane Workloads", + "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"^(kube-.*|.*-system|traefik|monitoring|logging|cert-manager|maintenance|postgres)$\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Pods Not Running", + "panel_id": 5, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "CrashLoop / ImagePull", + "panel_id": 6, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(time() - kube_pod_created{pod!=\"\"}) * 
on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Terminating >10m", + "panel_id": 7, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Node Pod Share", + "panel_id": 8, + "panel_type": "piechart", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node) / clamp_min(sum(kube_pod_info{pod!=\"\" , node!=\"\"}), 1)) * 100" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Top Nodes by Pod Count", + "panel_id": 9, + "panel_type": "bargauge", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))" + ] + }, + { + "dashboard": "Atlas Pods", + "panel_title": "Namespace Plurality by Node v27", + "panel_id": 10, + "panel_type": "table", + "description": "", + "tags": [ + "atlas", + "pods" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) 
(kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage", + "panel_id": 1, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage", + "panel_id": 2, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Free", + "panel_id": 3, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Free", 
+ "panel_id": 4, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Per-Node Usage", + "panel_id": 5, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Per-Node Usage", + "panel_id": 6, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Astreae Usage History", + "panel_id": 7, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Asteria Usage History", + "panel_id": 8, + "panel_type": "timeseries", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Sweepers Ready", + "panel_id": 30, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "kube_daemonset_status_number_ready{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace=\"maintenance\",daemonset=\"node-image-sweeper\"} * 100" + ] + }, + { + "dashboard": "Atlas Storage", + "panel_title": "Maintenance Cron Freshness (s)", + "panel_id": 31, + "panel_type": "stat", + "description": "", + "tags": [ + "atlas", + "storage" + ], + "datasource_uid": "atlas-vm", + "datasource_type": "prometheus", + "exprs": [ + "time() - max by (cronjob) 
(kube_cronjob_status_last_successful_time{namespace=\"maintenance\",cronjob=\"image-sweeper\"})" + ] + } +] diff --git a/services/comms/kustomization.yaml b/services/comms/kustomization.yaml index 3360067..37f681d 100644 --- a/services/comms/kustomization.yaml +++ b/services/comms/kustomization.yaml @@ -73,5 +73,6 @@ configMapGenerator: - INDEX.md=knowledge/INDEX.md - atlas.json=knowledge/catalog/atlas.json - atlas-summary.json=knowledge/catalog/atlas-summary.json + - metrics.json=knowledge/catalog/metrics.json - runbooks.json=knowledge/catalog/runbooks.json - atlas-http.mmd=knowledge/diagrams/atlas-http.mmd diff --git a/services/comms/scripts/atlasbot/bot.py b/services/comms/scripts/atlasbot/bot.py index 8edc28d..e604e65 100644 --- a/services/comms/scripts/atlasbot/bot.py +++ b/services/comms/scripts/atlasbot/bot.py @@ -17,7 +17,7 @@ ROOM_ALIAS = "#othrys:live.bstein.dev" OLLAMA_URL = os.environ.get("OLLAMA_URL", "https://chat.ai.bstein.dev/") MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:7b-instruct-q4_0") API_KEY = os.environ.get("CHAT_API_KEY", "") -OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "90")) +OLLAMA_TIMEOUT_SEC = float(os.environ.get("OLLAMA_TIMEOUT_SEC", "480")) KB_DIR = os.environ.get("KB_DIR", "") VM_URL = os.environ.get("VM_URL", "http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428") @@ -29,6 +29,7 @@ SERVER_NAME = os.environ.get("MATRIX_SERVER_NAME", "live.bstein.dev") MAX_KB_CHARS = int(os.environ.get("ATLASBOT_MAX_KB_CHARS", "2500")) MAX_TOOL_CHARS = int(os.environ.get("ATLASBOT_MAX_TOOL_CHARS", "2500")) +THINKING_INTERVAL_SEC = int(os.environ.get("ATLASBOT_THINKING_INTERVAL_SEC", "120")) TOKEN_RE = re.compile(r"[a-z0-9][a-z0-9_.-]{1,}", re.IGNORECASE) HOST_RE = re.compile(r"(?i)([a-z0-9-]+(?:\\.[a-z0-9-]+)+)") @@ -59,8 +60,21 @@ STOPWORDS = { } METRIC_HINT_WORDS = { + "bandwidth", + "connections", + "cpu", + "database", + "db", + "disk", "health", + "memory", + "network", + "node", + "nodes", + "postgres", "status", + "storage", + "usage", "down", "slow", "error", @@ -157,6 +171,7 @@ def send_msg(token: str, room: str, text: str): KB = {"catalog": {}, "runbooks": []} _HOST_INDEX: dict[str, list[dict]] = {} _NAME_INDEX: set[str] = set() +_METRIC_INDEX: list[dict[str, Any]] = [] _NODE_CLASS_INDEX: dict[str, list[str]] = {} _NODE_CLASS_RPI4: set[str] = set() _NODE_CLASS_RPI5: set[str] = set() @@ -180,6 +195,7 @@ def load_kb(): return catalog = _load_json_file(os.path.join(KB_DIR, "catalog", "atlas.json")) or {} runbooks = _load_json_file(os.path.join(KB_DIR, "catalog", "runbooks.json")) or [] + metrics = _load_json_file(os.path.join(KB_DIR, "catalog", "metrics.json")) or [] KB = {"catalog": catalog, "runbooks": runbooks} host_index: dict[str, list[dict]] = collections.defaultdict(list) @@ -197,6 +213,7 @@ def load_kb(): if isinstance(w, dict) and w.get("name"): names.add(str(w["name"]).lower()) _NAME_INDEX = names + _METRIC_INDEX = metrics if isinstance(metrics, list) else [] node_classes = _parse_node_classes(runbooks) _NODE_CLASS_INDEX = node_classes @@ -356,6 +373,65 @@ def node_inventory_context(query: str) -> str: return "" return "\n".join(lines) +def _metric_tokens(entry: dict[str, Any]) -> str: + parts: list[str] = [] + for key in ("panel_title", "dashboard", "description"): + val = entry.get(key) + if isinstance(val, str) and val: + parts.append(val.lower()) + tags = entry.get("tags") + if isinstance(tags, list): + parts.extend(str(t).lower() for t in tags if t) + return " ".join(parts) + +def metrics_lookup(query: 
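+# The lookup below is a lightweight keyword ranker over the metrics catalog:
+# each distinct query token (as produced by _tokens()) that occurs as a
+# substring of the text assembled by _metric_tokens() scores 1 point, or 2
+# points when it also occurs in the panel title, and the highest-scoring
+# entries (up to `limit`) are returned. For example, assuming _tokens()
+# lower-cases the prompt, "how full is asteria storage" ranks the Asteria
+# panels above the Astreae ones, since "asteria" never matches an Astreae
+# entry. metrics_query_context() then evaluates up to two exprs from the best
+# match via vm_query() and packages the rendered samples as (context, fallback).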
+def metrics_lookup(query: str, limit: int = 3) -> list[dict[str, Any]]:
+    q_tokens = _tokens(query)
+    if not q_tokens or not _METRIC_INDEX:
+        return []
+    scored: list[tuple[int, dict[str, Any]]] = []
+    for entry in _METRIC_INDEX:
+        if not isinstance(entry, dict):
+            continue
+        hay = _metric_tokens(entry)
+        if not hay:
+            continue
+        score = 0
+        for t in set(q_tokens):
+            if t in hay:
+                score += 2 if t in (entry.get("panel_title") or "").lower() else 1
+        if score:
+            scored.append((score, entry))
+    scored.sort(key=lambda item: item[0], reverse=True)
+    return [entry for _, entry in scored[:limit]]
+
+def metrics_query_context(prompt: str, *, allow_tools: bool) -> tuple[str, str]:
+    if not allow_tools:
+        return "", ""
+    lower = (prompt or "").lower()
+    if not any(word in lower for word in METRIC_HINT_WORDS):
+        return "", ""
+    matches = metrics_lookup(prompt, limit=1)
+    if not matches:
+        return "", ""
+    entry = matches[0]
+    exprs = entry.get("exprs") if isinstance(entry.get("exprs"), list) else []
+    if not exprs:
+        return "", ""
+    rendered_parts: list[str] = []
+    for expr in exprs[:2]:
+        res = vm_query(expr, timeout=20)
+        rendered = vm_render_result(res, limit=10)
+        if rendered:
+            rendered_parts.append(rendered)
+    if not rendered_parts:
+        return "", ""
+    dashboard = entry.get("dashboard") or "dashboard"
+    panel = entry.get("panel_title") or "panel"
+    summary = "\n".join(rendered_parts)
+    context = f"Metrics (from {dashboard} / {panel}):\n{summary}"
+    fallback = f"{panel}: {summary}"
+    return context, fallback
+
 def jetson_nodes_from_kb() -> list[str]:
     for doc in KB.get("runbooks", []):
         if not isinstance(doc, dict):
@@ -777,6 +853,7 @@ def _ollama_call(hist_key, prompt: str, *, context: str) -> str:
         "Be helpful, direct, and concise. "
         "Prefer answering with exact repo paths and Kubernetes resource names. "
         "Never include or request secret values. "
+        "Do not suggest commands unless explicitly asked. "
         "Respond in plain sentences; do not return JSON or code fences unless explicitly asked. "
         "If the answer is not grounded in the provided context or tool data, say you do not know."
     )
@@ -820,7 +897,17 @@ def ollama_reply_with_thinking(token: str, room: str, hist_key, prompt: str, *,
     thread.start()
     if not done.wait(2.0):
         send_msg(token, room, "Thinking…")
-        done.wait()
+        prompt_hint = " ".join((prompt or "").split())
+        if len(prompt_hint) > 160:
+            prompt_hint = prompt_hint[:157] + "…"
+        heartbeat = max(10, THINKING_INTERVAL_SEC)
+        next_heartbeat = time.monotonic() + heartbeat
+        while not done.wait(max(0, next_heartbeat - time.monotonic())):
+            if prompt_hint:
+                send_msg(token, room, f"Still thinking about: {prompt_hint} (gathering context)")
+            else:
+                send_msg(token, room, "Still thinking (gathering context)…")
+            next_heartbeat += heartbeat
     thread.join(timeout=1)
     return result["reply"] or fallback or "Model backend is busy. Try again in a moment."
@@ -937,9 +1024,15 @@ def sync_loop(token: str, room_id: str):
                 rendered = vm_render_result(res, limit=15) or "(no results)"
                 extra = "VictoriaMetrics (PromQL result):\n" + rendered
                 context = (context + "\n\n" + extra).strip() if context else extra
+            metrics_context, metrics_fallback = metrics_query_context(body, allow_tools=allow_tools)
+            if metrics_context:
+                context = (context + "\n\n" + metrics_context).strip() if context else metrics_context
+            fallback = ""
             if "node" in lower_body or "cluster" in lower_body:
                 fallback = node_inventory_answer("Atlas", lower_body)
+            if metrics_fallback and not fallback:
+                fallback = metrics_fallback
             reply = ollama_reply_with_thinking(
                 token,
                 rid,