From 293cd8399994608e588492d9313bf2f5c89324f2 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 9 Apr 2026 13:39:55 -0300 Subject: [PATCH] monitoring/atlas: resize test/ops rows and source overview tests from atlas-jobs --- scripts/dashboards_render_atlas.py | 50 ++++---- .../monitoring/dashboards/atlas-overview.json | 109 ++++++++++-------- .../grafana-dashboard-overview.yaml | 109 ++++++++++-------- 3 files changed, 142 insertions(+), 126 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 938825b4..c45a8b7f 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -1237,7 +1237,7 @@ def build_overview(): panel_id, title, f"{expr}", - {"h": 3, "w": 6, "x": 6 * idx, "y": 5}, + {"h": 2, "w": 6, "x": 6 * idx, "y": 5}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, text_mode="name_and_value", @@ -1286,7 +1286,7 @@ def build_overview(): panel_id, title, expr, - {"h": 3, "w": 6, "x": 6 * idx, "y": 8}, + {"h": 2, "w": 6, "x": 6 * idx, "y": 7}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, links=link_to("atlas-storage"), @@ -1308,7 +1308,7 @@ def build_overview(): 40, "UPS Current Load", None, - {"h": 6, "w": 4, "x": 0, "y": 14}, + {"h": 5, "w": 4, "x": 0, "y": 13}, unit="none", decimals=1, text_mode="name_and_value", @@ -1355,7 +1355,7 @@ def build_overview(): 41, "UPS History (Power Draw)", None, - {"h": 6, "w": 4, "x": 4, "y": 14}, + {"h": 5, "w": 4, "x": 4, "y": 13}, unit="watt", targets=[ {"refId": "A", "expr": ANANKE_UPS_DRAW_WATTS_DB_SERIES, "legendFormat": ANANKE_UPS_DB_NAME}, @@ -1372,7 +1372,7 @@ def build_overview(): 42, "Current Climate", None, - {"h": 6, "w": 4, "x": 8, "y": 14}, + {"h": 5, "w": 4, "x": 8, "y": 13}, unit="none", decimals=2, text_mode="value", @@ -1395,7 +1395,7 @@ def build_overview(): 43, "Climate History", None, - {"h": 6, "w": 4, "x": 12, "y": 14}, + {"h": 5, "w": 4, "x": 12, "y": 13}, unit="celsius", targets=[ {"refId": "A", "expr": CLIMATE_TEMP_SERIES, "legendFormat": "Temperature (°C)"}, @@ -1422,7 +1422,7 @@ def build_overview(): 140, "Fan Activity", None, - {"h": 6, "w": 4, "x": 16, "y": 14}, + {"h": 5, "w": 4, "x": 16, "y": 13}, unit="none", decimals=0, text_mode="name_and_value", @@ -1448,7 +1448,7 @@ def build_overview(): 141, "Fan History (0-10)", None, - {"h": 6, "w": 4, "x": 20, "y": 14}, + {"h": 5, "w": 4, "x": 20, "y": 13}, unit="none", max_value=10, targets=[ @@ -1468,7 +1468,7 @@ def build_overview(): 44, "One-off Job Pods >1h", f"({ONEOFF_JOB_POD_AGE_HOURS}) > 1", - {"h": 3, "w": 6, "x": 0, "y": 11}, + {"h": 4, "w": 6, "x": 0, "y": 9}, unit="h", instant=True, transformations=[ @@ -1484,7 +1484,7 @@ def build_overview(): 45, "Ariadne Attempts (24h)", "sum(increase(ariadne_task_runs_total[24h]))", - {"h": 3, "w": 6, "x": 6, "y": 11}, + {"h": 4, "w": 6, "x": 6, "y": 9}, unit="none", decimals=0, links=link_to("atlas-jobs"), @@ -1494,29 +1494,31 @@ def build_overview(): 46, "Platform Test Success Rate", None, - {"h": 3, "w": 6, "x": 12, "y": 11}, + {"h": 4, "w": 6, "x": 12, "y": 9}, unit="percent", targets=[ { "refId": "A", - "expr": PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES, - "legendFormat": "{{suite}}", + "datasource": {"type": "datasource", "uid": "-- Dashboard --"}, + "dashboardUid": "atlas-jobs", + "panelId": 19, } ], - legend_display="list", - legend_placement="bottom", + legend_display="table", + legend_placement="right", + legend_calcs=["last"], links=link_to("atlas-jobs"), ) + test_success["datasource"] = {"type": "datasource", "uid": "-- Dashboard --"} test_success["description"] = ( - "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines + Ananke quality gate). " - "Each line tracks pass percentage over time for its suite." + "Sourced directly from Atlas Jobs internal panel #19 (atlas-jobs) so Overview reuses the exact same suite-level test stream." ) panels.append(test_success) test_failures = stat_panel( 47, "Platform Test Failures (24h)", TEST_FAILURES_24H_TOTAL, - {"h": 3, "w": 6, "x": 18, "y": 11}, + {"h": 4, "w": 6, "x": 18, "y": 9}, unit="none", decimals=0, instant=True, @@ -1533,7 +1535,7 @@ def build_overview(): 30, "Mail Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', - {"h": 3, "w": 4, "x": 0, "y": 20}, + {"h": 2, "w": 4, "x": 0, "y": 18}, unit="none", links=link_to("atlas-mail"), ) @@ -1544,7 +1546,7 @@ def build_overview(): "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, - "gridPos": {"h": 3, "w": 4, "x": 8, "y": 20}, + "gridPos": {"h": 2, "w": 4, "x": 8, "y": 18}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', @@ -1590,7 +1592,7 @@ def build_overview(): 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', - {"h": 3, "w": 4, "x": 4, "y": 20}, + {"h": 2, "w": 4, "x": 4, "y": 18}, unit="percent", thresholds=mail_success_thresholds, decimals=1, @@ -1602,7 +1604,7 @@ def build_overview(): 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", - {"h": 3, "w": 4, "x": 12, "y": 20}, + {"h": 2, "w": 4, "x": 12, "y": 18}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, @@ -1614,7 +1616,7 @@ def build_overview(): 34, "Postgres Connections Used", POSTGRES_CONN_USED, - {"h": 3, "w": 4, "x": 16, "y": 20}, + {"h": 2, "w": 4, "x": 16, "y": 18}, decimals=0, text_mode="name_and_value", legend="{{conn}}", @@ -1626,7 +1628,7 @@ def build_overview(): 35, "Postgres Hottest Connections", POSTGRES_CONN_HOTTEST, - {"h": 3, "w": 4, "x": 20, "y": 20}, + {"h": 2, "w": 4, "x": 20, "y": 18}, unit="none", decimals=0, text_mode="name_and_value", diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index ec4be052..b3ae3ef0 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -503,7 +503,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 0, "y": 5 @@ -580,7 +580,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 6, "y": 5 @@ -657,7 +657,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 12, "y": 5 @@ -726,7 +726,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 18, "y": 5 @@ -795,10 +795,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 0, - "y": 8 + "y": 7 }, "targets": [ { @@ -870,10 +870,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 6, - "y": 8 + "y": 7 }, "targets": [ { @@ -945,10 +945,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 12, - "y": 8 + "y": 7 }, "targets": [ { @@ -1012,10 +1012,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 18, - "y": 8 + "y": 7 }, "targets": [ { @@ -1079,10 +1079,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 4, "x": 0, - "y": 14 + "y": 13 }, "targets": [ { @@ -1301,10 +1301,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 4, "x": 4, - "y": 14 + "y": 13 }, "targets": [ { @@ -1355,10 +1355,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 4, "x": 8, - "y": 14 + "y": 13 }, "targets": [ { @@ -1459,10 +1459,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 4, "x": 12, - "y": 14 + "y": 13 }, "targets": [ { @@ -1533,10 +1533,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 4, "x": 16, - "y": 14 + "y": 13 }, "targets": [ { @@ -1625,10 +1625,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 4, "x": 20, - "y": 14 + "y": 13 }, "targets": [ { @@ -1685,10 +1685,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 4, "w": 6, "x": 0, - "y": 11 + "y": 9 }, "targets": [ { @@ -1744,10 +1744,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 4, "w": 6, "x": 6, - "y": 11 + "y": 9 }, "targets": [ { @@ -1808,20 +1808,24 @@ "type": "timeseries", "title": "Platform Test Success Rate", "datasource": { - "type": "prometheus", - "uid": "atlas-vm" + "type": "datasource", + "uid": "-- Dashboard --" }, "gridPos": { - "h": 3, + "h": 4, "w": 6, "x": 12, - "y": 11 + "y": 9 }, "targets": [ { "refId": "A", - "expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")", - "legendFormat": "{{suite}}" + "datasource": { + "type": "datasource", + "uid": "-- Dashboard --" + }, + "dashboardUid": "atlas-jobs", + "panelId": 19 } ], "fieldConfig": { @@ -1832,8 +1836,11 @@ }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] }, "tooltip": { "mode": "multi" @@ -1846,7 +1853,7 @@ "targetBlank": true } ], - "description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines + Ananke quality gate). Each line tracks pass percentage over time for its suite." + "description": "Sourced directly from Atlas Jobs internal panel #19 (atlas-jobs) so Overview reuses the exact same suite-level test stream." }, { "id": 47, @@ -1857,10 +1864,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 4, "w": 6, "x": 18, - "y": 11 + "y": 9 }, "targets": [ { @@ -1935,10 +1942,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 4, "x": 0, - "y": 20 + "y": 18 }, "targets": [ { @@ -2002,10 +2009,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 4, "x": 8, - "y": 20 + "y": 18 }, "targets": [ { @@ -2107,10 +2114,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 4, "x": 4, - "y": 20 + "y": 18 }, "targets": [ { @@ -2183,10 +2190,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 4, "x": 12, - "y": 20 + "y": 18 }, "targets": [ { @@ -2259,10 +2266,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 4, "x": 16, - "y": 20 + "y": 18 }, "targets": [ { @@ -2322,10 +2329,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 4, "x": 20, - "y": 20 + "y": 18 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index ececba18..d5e9ae93 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -512,7 +512,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 0, "y": 5 @@ -589,7 +589,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 6, "y": 5 @@ -666,7 +666,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 12, "y": 5 @@ -735,7 +735,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 18, "y": 5 @@ -804,10 +804,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 0, - "y": 8 + "y": 7 }, "targets": [ { @@ -879,10 +879,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 6, - "y": 8 + "y": 7 }, "targets": [ { @@ -954,10 +954,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 12, - "y": 8 + "y": 7 }, "targets": [ { @@ -1021,10 +1021,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 6, "x": 18, - "y": 8 + "y": 7 }, "targets": [ { @@ -1088,10 +1088,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 4, "x": 0, - "y": 14 + "y": 13 }, "targets": [ { @@ -1310,10 +1310,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 4, "x": 4, - "y": 14 + "y": 13 }, "targets": [ { @@ -1364,10 +1364,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 4, "x": 8, - "y": 14 + "y": 13 }, "targets": [ { @@ -1468,10 +1468,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 4, "x": 12, - "y": 14 + "y": 13 }, "targets": [ { @@ -1542,10 +1542,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 4, "x": 16, - "y": 14 + "y": 13 }, "targets": [ { @@ -1634,10 +1634,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 5, "w": 4, "x": 20, - "y": 14 + "y": 13 }, "targets": [ { @@ -1694,10 +1694,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 4, "w": 6, "x": 0, - "y": 11 + "y": 9 }, "targets": [ { @@ -1753,10 +1753,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 4, "w": 6, "x": 6, - "y": 11 + "y": 9 }, "targets": [ { @@ -1817,20 +1817,24 @@ data: "type": "timeseries", "title": "Platform Test Success Rate", "datasource": { - "type": "prometheus", - "uid": "atlas-vm" + "type": "datasource", + "uid": "-- Dashboard --" }, "gridPos": { - "h": 3, + "h": 4, "w": 6, "x": 12, - "y": 11 + "y": 9 }, "targets": [ { "refId": "A", - "expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")", - "legendFormat": "{{suite}}" + "datasource": { + "type": "datasource", + "uid": "-- Dashboard --" + }, + "dashboardUid": "atlas-jobs", + "panelId": 19 } ], "fieldConfig": { @@ -1841,8 +1845,11 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] }, "tooltip": { "mode": "multi" @@ -1855,7 +1862,7 @@ data: "targetBlank": true } ], - "description": "Rolling pass rate per platform suite (Ariadne task automation + Metis build/flash pipelines + Ananke quality gate). Each line tracks pass percentage over time for its suite." + "description": "Sourced directly from Atlas Jobs internal panel #19 (atlas-jobs) so Overview reuses the exact same suite-level test stream." }, { "id": 47, @@ -1866,10 +1873,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 4, "w": 6, "x": 18, - "y": 11 + "y": 9 }, "targets": [ { @@ -1944,10 +1951,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 4, "x": 0, - "y": 20 + "y": 18 }, "targets": [ { @@ -2011,10 +2018,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 4, "x": 8, - "y": 20 + "y": 18 }, "targets": [ { @@ -2116,10 +2123,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 4, "x": 4, - "y": 20 + "y": 18 }, "targets": [ { @@ -2192,10 +2199,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 4, "x": 12, - "y": 20 + "y": 18 }, "targets": [ { @@ -2268,10 +2275,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 4, "x": 16, - "y": 20 + "y": 18 }, "targets": [ { @@ -2331,10 +2338,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 3, + "h": 2, "w": 4, "x": 20, - "y": 20 + "y": 18 }, "targets": [ {