From 09fa3e716c54c5d12c02dc531dce7c547fb51a44 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 9 Apr 2026 14:56:43 -0300 Subject: [PATCH] monitoring/atlas: merge top rows and fix platform test pass-rate panel --- scripts/dashboards_render_atlas.py | 112 ++++++++---------- .../monitoring/dashboards/atlas-jobs.json | 8 +- .../monitoring/dashboards/atlas-overview.json | 104 ++++++++-------- .../monitoring/dashboards/atlas-power.json | 8 +- .../monitoring/grafana-dashboard-jobs.yaml | 8 +- .../grafana-dashboard-overview.yaml | 104 ++++++++-------- .../monitoring/grafana-dashboard-power.yaml | 8 +- 7 files changed, 170 insertions(+), 182 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index c45a8b7f..317765a6 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -447,18 +447,18 @@ PLATFORM_TEST_ACTIVITY_30D = ( 'or label_replace(sum by (status) (increase(ananke_quality_gate_runs_total{suite="ananke"}[30d])), "source", "ananke-quality", "__name__", ".*")' ) PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES = ( - 'label_replace(100 * (sum(increase(ariadne_task_runs_total{status="ok"}[$__interval])) or on() vector(0)) ' - '/ clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), ' - '"suite", "ariadne", "__name__", ".*") ' - 'or label_replace(100 * (sum(increase(metis_builds_total{status="ok"}[$__interval])) or on() vector(0)) ' - '/ clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), ' - '"suite", "metis-build", "__name__", ".*") ' - 'or label_replace(100 * (sum(increase(metis_flashes_total{status="ok"}[$__interval])) or on() vector(0)) ' - '/ clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), ' - '"suite", "metis-flash", "__name__", ".*") ' - 'or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="ok"}[$__interval])) or on() vector(0)) ' - '/ clamp_min((sum(increase(ananke_quality_gate_runs_total{suite="ananke"}[$__interval])) or on() vector(0)), 1), ' - '"suite", "ananke-quality", "__name__", ".*")' + 'label_replace(100 * (sum by (task) (increase(ariadne_task_runs_total{status="ok"}[$__interval])) ' + '/ clamp_min(sum by (task) (increase(ariadne_task_runs_total[$__interval])), 1)), ' + '"suite", "ariadne:${1}", "task", "(.*)") ' + 'or label_replace(100 * (sum by (node) (increase(metis_builds_total{status="ok"}[$__interval])) ' + '/ clamp_min(sum by (node) (increase(metis_builds_total[$__interval])), 1)), ' + '"suite", "metis-build:${1}", "node", "(.*)") ' + 'or label_replace(100 * (sum by (node) (increase(metis_flashes_total{status="ok"}[$__interval])) ' + '/ clamp_min(sum by (node) (increase(metis_flashes_total[$__interval])), 1)), ' + '"suite", "metis-flash:${1}", "node", "(.*)") ' + 'or label_replace(100 * (sum by (instance) (increase(ananke_quality_gate_runs_total{suite="ananke",status="ok"}[$__interval])) ' + '/ clamp_min(sum by (instance) (increase(ananke_quality_gate_runs_total{suite="ananke"}[$__interval])), 1)), ' + '"suite", "ananke-quality:${1}", "instance", "(.*)")' ) ANANKE_SELECTOR = 'job="ananke-power"' ANANKE_UPS_DB_NAME = "Pyrphoros" @@ -1225,25 +1225,30 @@ def build_overview(): ) ) - hottest = [ + top_health_panels = [ (7, "Hottest node: CPU", topk_with_node(node_cpu_expr()), "percent"), (8, "Hottest node: RAM", topk_with_node(node_mem_expr()), "percent"), (9, "Hottest node: NET (rx+tx)", topk_with_node(node_net_expr()), "Bps"), (10, "Hottest node: I/O (r+w)", topk_with_node(node_io_expr()), "Bps"), + (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), + (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"), + (25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"), + (26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"), ] - for idx, (panel_id, title, expr, unit) in enumerate(hottest): + for idx, (panel_id, title, expr, unit) in enumerate(top_health_panels): + is_hottest_panel = panel_id in {7, 8, 9, 10} panels.append( stat_panel( panel_id, title, f"{expr}", - {"h": 2, "w": 6, "x": 6 * idx, "y": 5}, + {"h": 2, "w": 3, "x": 3 * idx, "y": 5}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, - text_mode="name_and_value", - legend="{{node}}", - instant=True, - links=link_to("atlas-nodes"), + text_mode="name_and_value" if is_hottest_panel else "value", + legend="{{node}}" if is_hottest_panel else None, + instant=is_hottest_panel, + links=link_to("atlas-storage" if panel_id in {23, 24, 25, 26} else "atlas-nodes"), ) ) @@ -1274,25 +1279,6 @@ def build_overview(): {"color": "green", "value": 98}, ], } - storage_panels = [ - (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), - (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"), - (25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"), - (26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"), - ] - for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): - panels.append( - stat_panel( - panel_id, - title, - expr, - {"h": 2, "w": 6, "x": 6 * idx, "y": 7}, - unit=unit, - thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, - links=link_to("atlas-storage"), - ) - ) - status_mapping = [ { "type": "value", @@ -1308,7 +1294,7 @@ def build_overview(): 40, "UPS Current Load", None, - {"h": 5, "w": 4, "x": 0, "y": 13}, + {"h": 6, "w": 4, "x": 0, "y": 12}, unit="none", decimals=1, text_mode="name_and_value", @@ -1355,7 +1341,7 @@ def build_overview(): 41, "UPS History (Power Draw)", None, - {"h": 5, "w": 4, "x": 4, "y": 13}, + {"h": 6, "w": 4, "x": 4, "y": 12}, unit="watt", targets=[ {"refId": "A", "expr": ANANKE_UPS_DRAW_WATTS_DB_SERIES, "legendFormat": ANANKE_UPS_DB_NAME}, @@ -1372,7 +1358,7 @@ def build_overview(): 42, "Current Climate", None, - {"h": 5, "w": 4, "x": 8, "y": 13}, + {"h": 6, "w": 4, "x": 8, "y": 12}, unit="none", decimals=2, text_mode="value", @@ -1395,7 +1381,7 @@ def build_overview(): 43, "Climate History", None, - {"h": 5, "w": 4, "x": 12, "y": 13}, + {"h": 6, "w": 4, "x": 12, "y": 12}, unit="celsius", targets=[ {"refId": "A", "expr": CLIMATE_TEMP_SERIES, "legendFormat": "Temperature (°C)"}, @@ -1422,14 +1408,14 @@ def build_overview(): 140, "Fan Activity", None, - {"h": 5, "w": 4, "x": 16, "y": 13}, + {"h": 6, "w": 4, "x": 16, "y": 12}, unit="none", decimals=0, text_mode="name_and_value", targets=[ {"refId": "A", "expr": f"round({CLIMATE_FAN_OUTLET_CURRENT})", "legendFormat": "Outlet", "instant": True}, - {"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "Inside Inlet", "instant": True}, - {"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Outside Inlet", "instant": True}, + {"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "In Inlet", "instant": True}, + {"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Out Inlet", "instant": True}, {"refId": "D", "expr": f"round({CLIMATE_FAN_INTERIOR_CURRENT})", "legendFormat": "Interior", "instant": True}, ], thresholds={ @@ -1448,13 +1434,13 @@ def build_overview(): 141, "Fan History (0-10)", None, - {"h": 5, "w": 4, "x": 20, "y": 13}, + {"h": 6, "w": 4, "x": 20, "y": 12}, unit="none", max_value=10, targets=[ {"refId": "A", "expr": CLIMATE_FAN_OUTLET_SERIES, "legendFormat": "Outlet"}, - {"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "Inside Inlet"}, - {"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Outside Inlet"}, + {"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "In Inlet"}, + {"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Out Inlet"}, {"refId": "D", "expr": CLIMATE_FAN_INTERIOR_SERIES, "legendFormat": "Interior"}, ], legend_display="list", @@ -1468,7 +1454,7 @@ def build_overview(): 44, "One-off Job Pods >1h", f"({ONEOFF_JOB_POD_AGE_HOURS}) > 1", - {"h": 4, "w": 6, "x": 0, "y": 9}, + {"h": 5, "w": 6, "x": 0, "y": 7}, unit="h", instant=True, transformations=[ @@ -1484,7 +1470,7 @@ def build_overview(): 45, "Ariadne Attempts (24h)", "sum(increase(ariadne_task_runs_total[24h]))", - {"h": 4, "w": 6, "x": 6, "y": 9}, + {"h": 5, "w": 6, "x": 6, "y": 7}, unit="none", decimals=0, links=link_to("atlas-jobs"), @@ -1494,14 +1480,13 @@ def build_overview(): 46, "Platform Test Success Rate", None, - {"h": 4, "w": 6, "x": 12, "y": 9}, + {"h": 5, "w": 6, "x": 12, "y": 7}, unit="percent", targets=[ { "refId": "A", - "datasource": {"type": "datasource", "uid": "-- Dashboard --"}, - "dashboardUid": "atlas-jobs", - "panelId": 19, + "expr": PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES, + "legendFormat": "{{suite}}", } ], legend_display="table", @@ -1509,16 +1494,17 @@ def build_overview(): legend_calcs=["last"], links=link_to("atlas-jobs"), ) - test_success["datasource"] = {"type": "datasource", "uid": "-- Dashboard --"} + test_success["fieldConfig"]["defaults"]["min"] = 0 + test_success["fieldConfig"]["defaults"]["max"] = 100 test_success["description"] = ( - "Sourced directly from Atlas Jobs internal panel #19 (atlas-jobs) so Overview reuses the exact same suite-level test stream." + "Per-test rolling pass rate (0-100) across Ariadne tasks, Metis build/flash node runs, and Ananke quality-gate runs." ) panels.append(test_success) test_failures = stat_panel( 47, "Platform Test Failures (24h)", TEST_FAILURES_24H_TOTAL, - {"h": 4, "w": 6, "x": 18, "y": 9}, + {"h": 5, "w": 6, "x": 18, "y": 7}, unit="none", decimals=0, instant=True, @@ -2993,8 +2979,10 @@ def build_jobs_dashboard(): legend_display="list", legend_placement="bottom", ) + suite_panel["fieldConfig"]["defaults"]["min"] = 0 + suite_panel["fieldConfig"]["defaults"]["max"] = 100 suite_panel["description"] = ( - "Per-suite pass percentage over time. Used by Atlas Overview and kept here for detailed triage." + "Per-test/per-node pass percentage over time across Ariadne, Metis, and Ananke quality suites." ) panels.append(suite_panel) @@ -3147,8 +3135,8 @@ def build_power_dashboard(): text_mode="name_and_value", targets=[ {"refId": "A", "expr": f"round({CLIMATE_FAN_OUTLET_CURRENT})", "legendFormat": "Outlet", "instant": True}, - {"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "Inside Inlet", "instant": True}, - {"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Outside Inlet", "instant": True}, + {"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "In Inlet", "instant": True}, + {"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Out Inlet", "instant": True}, {"refId": "D", "expr": f"round({CLIMATE_FAN_INTERIOR_CURRENT})", "legendFormat": "Interior", "instant": True}, ], thresholds={ @@ -3172,8 +3160,8 @@ def build_power_dashboard(): max_value=10, targets=[ {"refId": "A", "expr": CLIMATE_FAN_OUTLET_SERIES, "legendFormat": "Outlet"}, - {"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "Inside Inlet"}, - {"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Outside Inlet"}, + {"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "In Inlet"}, + {"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Out Inlet"}, {"refId": "D", "expr": CLIMATE_FAN_INTERIOR_SERIES, "legendFormat": "Interior"}, ], legend_display="table", diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index 45ef7a68..94cfb6ab 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -1253,13 +1253,15 @@ "targets": [ { "refId": "A", - "expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")", + "expr": "label_replace(100 * (sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (task) (increase(ariadne_task_runs_total[$__interval])), 1)), \"suite\", \"ariadne:${1}\", \"task\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_builds_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_builds_total[$__interval])), 1)), \"suite\", \"metis-build:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_flashes_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_flashes_total[$__interval])), 1)), \"suite\", \"metis-flash:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) / clamp_min(sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])), 1)), \"suite\", \"ananke-quality:${1}\", \"instance\", \"(.*)\")", "legendFormat": "{{suite}}" } ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "min": 0, + "max": 100 }, "overrides": [] }, @@ -1272,7 +1274,7 @@ "mode": "multi" } }, - "description": "Per-suite pass percentage over time. Used by Atlas Overview and kept here for detailed triage." + "description": "Per-test/per-node pass percentage over time across Ariadne, Metis, and Ananke quality suites." } ], "time": { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index b3ae3ef0..8cd32cc9 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -504,7 +504,7 @@ }, "gridPos": { "h": 2, - "w": 6, + "w": 3, "x": 0, "y": 5 }, @@ -581,8 +581,8 @@ }, "gridPos": { "h": 2, - "w": 6, - "x": 6, + "w": 3, + "x": 3, "y": 5 }, "targets": [ @@ -658,8 +658,8 @@ }, "gridPos": { "h": 2, - "w": 6, - "x": 12, + "w": 3, + "x": 6, "y": 5 }, "targets": [ @@ -727,8 +727,8 @@ }, "gridPos": { "h": 2, - "w": 6, - "x": 18, + "w": 3, + "x": 9, "y": 5 }, "targets": [ @@ -796,9 +796,9 @@ }, "gridPos": { "h": 2, - "w": 6, - "x": 0, - "y": 7 + "w": 3, + "x": 12, + "y": 5 }, "targets": [ { @@ -871,9 +871,9 @@ }, "gridPos": { "h": 2, - "w": 6, - "x": 6, - "y": 7 + "w": 3, + "x": 15, + "y": 5 }, "targets": [ { @@ -946,9 +946,9 @@ }, "gridPos": { "h": 2, - "w": 6, - "x": 12, - "y": 7 + "w": 3, + "x": 18, + "y": 5 }, "targets": [ { @@ -1013,9 +1013,9 @@ }, "gridPos": { "h": 2, - "w": 6, - "x": 18, - "y": 7 + "w": 3, + "x": 21, + "y": 5 }, "targets": [ { @@ -1079,10 +1079,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 6, "w": 4, "x": 0, - "y": 13 + "y": 12 }, "targets": [ { @@ -1301,10 +1301,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 6, "w": 4, "x": 4, - "y": 13 + "y": 12 }, "targets": [ { @@ -1355,10 +1355,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 6, "w": 4, "x": 8, - "y": 13 + "y": 12 }, "targets": [ { @@ -1459,10 +1459,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 6, "w": 4, "x": 12, - "y": 13 + "y": 12 }, "targets": [ { @@ -1533,10 +1533,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 6, "w": 4, "x": 16, - "y": 13 + "y": 12 }, "targets": [ { @@ -1548,13 +1548,13 @@ { "refId": "B", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"inside_inlet\"}) or on() vector(0))", - "legendFormat": "Inside Inlet", + "legendFormat": "In Inlet", "instant": true }, { "refId": "C", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"outside_inlet\"}) or on() vector(0))", - "legendFormat": "Outside Inlet", + "legendFormat": "Out Inlet", "instant": true }, { @@ -1625,10 +1625,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 6, "w": 4, "x": 20, - "y": 13 + "y": 12 }, "targets": [ { @@ -1639,12 +1639,12 @@ { "refId": "B", "expr": "(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"} or atlas_climate_fan_activity_level{position=\"inside_inlet\"})", - "legendFormat": "Inside Inlet" + "legendFormat": "In Inlet" }, { "refId": "C", "expr": "(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"} or atlas_climate_fan_activity_level{position=\"outside_inlet\"})", - "legendFormat": "Outside Inlet" + "legendFormat": "Out Inlet" }, { "refId": "D", @@ -1685,10 +1685,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 4, + "h": 5, "w": 6, "x": 0, - "y": 9 + "y": 7 }, "targets": [ { @@ -1744,10 +1744,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 4, + "h": 5, "w": 6, "x": 6, - "y": 9 + "y": 7 }, "targets": [ { @@ -1808,29 +1808,27 @@ "type": "timeseries", "title": "Platform Test Success Rate", "datasource": { - "type": "datasource", - "uid": "-- Dashboard --" + "type": "prometheus", + "uid": "atlas-vm" }, "gridPos": { - "h": 4, + "h": 5, "w": 6, "x": 12, - "y": 9 + "y": 7 }, "targets": [ { "refId": "A", - "datasource": { - "type": "datasource", - "uid": "-- Dashboard --" - }, - "dashboardUid": "atlas-jobs", - "panelId": 19 + "expr": "label_replace(100 * (sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (task) (increase(ariadne_task_runs_total[$__interval])), 1)), \"suite\", \"ariadne:${1}\", \"task\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_builds_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_builds_total[$__interval])), 1)), \"suite\", \"metis-build:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_flashes_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_flashes_total[$__interval])), 1)), \"suite\", \"metis-flash:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) / clamp_min(sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])), 1)), \"suite\", \"ananke-quality:${1}\", \"instance\", \"(.*)\")", + "legendFormat": "{{suite}}" } ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "min": 0, + "max": 100 }, "overrides": [] }, @@ -1853,7 +1851,7 @@ "targetBlank": true } ], - "description": "Sourced directly from Atlas Jobs internal panel #19 (atlas-jobs) so Overview reuses the exact same suite-level test stream." + "description": "Per-test rolling pass rate (0-100) across Ariadne tasks, Metis build/flash node runs, and Ananke quality-gate runs." }, { "id": 47, @@ -1864,10 +1862,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 4, + "h": 5, "w": 6, "x": 18, - "y": 9 + "y": 7 }, "targets": [ { diff --git a/services/monitoring/dashboards/atlas-power.json b/services/monitoring/dashboards/atlas-power.json index 1e8df3b8..1595f417 100644 --- a/services/monitoring/dashboards/atlas-power.json +++ b/services/monitoring/dashboards/atlas-power.json @@ -454,13 +454,13 @@ { "refId": "B", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"inside_inlet\"}) or on() vector(0))", - "legendFormat": "Inside Inlet", + "legendFormat": "In Inlet", "instant": true }, { "refId": "C", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"outside_inlet\"}) or on() vector(0))", - "legendFormat": "Outside Inlet", + "legendFormat": "Out Inlet", "instant": true }, { @@ -539,12 +539,12 @@ { "refId": "B", "expr": "(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"} or atlas_climate_fan_activity_level{position=\"inside_inlet\"})", - "legendFormat": "Inside Inlet" + "legendFormat": "In Inlet" }, { "refId": "C", "expr": "(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"} or atlas_climate_fan_activity_level{position=\"outside_inlet\"})", - "legendFormat": "Outside Inlet" + "legendFormat": "Out Inlet" }, { "refId": "D", diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 4ebf068f..e4f0fb4d 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -1262,13 +1262,15 @@ data: "targets": [ { "refId": "A", - "expr": "label_replace(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ariadne_task_runs_total[$__interval])) or on() vector(0)), 1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_builds_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-build\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(metis_flashes_total[$__interval])) or on() vector(0)), 1), \"suite\", \"metis-flash\", \"__name__\", \".*\") or label_replace(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) or on() vector(0)) / clamp_min((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])) or on() vector(0)), 1), \"suite\", \"ananke-quality\", \"__name__\", \".*\")", + "expr": "label_replace(100 * (sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (task) (increase(ariadne_task_runs_total[$__interval])), 1)), \"suite\", \"ariadne:${1}\", \"task\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_builds_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_builds_total[$__interval])), 1)), \"suite\", \"metis-build:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_flashes_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_flashes_total[$__interval])), 1)), \"suite\", \"metis-flash:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) / clamp_min(sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])), 1)), \"suite\", \"ananke-quality:${1}\", \"instance\", \"(.*)\")", "legendFormat": "{{suite}}" } ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "min": 0, + "max": 100 }, "overrides": [] }, @@ -1281,7 +1283,7 @@ data: "mode": "multi" } }, - "description": "Per-suite pass percentage over time. Used by Atlas Overview and kept here for detailed triage." + "description": "Per-test/per-node pass percentage over time across Ariadne, Metis, and Ananke quality suites." } ], "time": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index d5e9ae93..9f30d15c 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -513,7 +513,7 @@ data: }, "gridPos": { "h": 2, - "w": 6, + "w": 3, "x": 0, "y": 5 }, @@ -590,8 +590,8 @@ data: }, "gridPos": { "h": 2, - "w": 6, - "x": 6, + "w": 3, + "x": 3, "y": 5 }, "targets": [ @@ -667,8 +667,8 @@ data: }, "gridPos": { "h": 2, - "w": 6, - "x": 12, + "w": 3, + "x": 6, "y": 5 }, "targets": [ @@ -736,8 +736,8 @@ data: }, "gridPos": { "h": 2, - "w": 6, - "x": 18, + "w": 3, + "x": 9, "y": 5 }, "targets": [ @@ -805,9 +805,9 @@ data: }, "gridPos": { "h": 2, - "w": 6, - "x": 0, - "y": 7 + "w": 3, + "x": 12, + "y": 5 }, "targets": [ { @@ -880,9 +880,9 @@ data: }, "gridPos": { "h": 2, - "w": 6, - "x": 6, - "y": 7 + "w": 3, + "x": 15, + "y": 5 }, "targets": [ { @@ -955,9 +955,9 @@ data: }, "gridPos": { "h": 2, - "w": 6, - "x": 12, - "y": 7 + "w": 3, + "x": 18, + "y": 5 }, "targets": [ { @@ -1022,9 +1022,9 @@ data: }, "gridPos": { "h": 2, - "w": 6, - "x": 18, - "y": 7 + "w": 3, + "x": 21, + "y": 5 }, "targets": [ { @@ -1088,10 +1088,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 6, "w": 4, "x": 0, - "y": 13 + "y": 12 }, "targets": [ { @@ -1310,10 +1310,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 6, "w": 4, "x": 4, - "y": 13 + "y": 12 }, "targets": [ { @@ -1364,10 +1364,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 6, "w": 4, "x": 8, - "y": 13 + "y": 12 }, "targets": [ { @@ -1468,10 +1468,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 6, "w": 4, "x": 12, - "y": 13 + "y": 12 }, "targets": [ { @@ -1542,10 +1542,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 6, "w": 4, "x": 16, - "y": 13 + "y": 12 }, "targets": [ { @@ -1557,13 +1557,13 @@ data: { "refId": "B", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"inside_inlet\"}) or on() vector(0))", - "legendFormat": "Inside Inlet", + "legendFormat": "In Inlet", "instant": true }, { "refId": "C", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"outside_inlet\"}) or on() vector(0))", - "legendFormat": "Outside Inlet", + "legendFormat": "Out Inlet", "instant": true }, { @@ -1634,10 +1634,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 6, "w": 4, "x": 20, - "y": 13 + "y": 12 }, "targets": [ { @@ -1648,12 +1648,12 @@ data: { "refId": "B", "expr": "(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"} or atlas_climate_fan_activity_level{position=\"inside_inlet\"})", - "legendFormat": "Inside Inlet" + "legendFormat": "In Inlet" }, { "refId": "C", "expr": "(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"} or atlas_climate_fan_activity_level{position=\"outside_inlet\"})", - "legendFormat": "Outside Inlet" + "legendFormat": "Out Inlet" }, { "refId": "D", @@ -1694,10 +1694,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 4, + "h": 5, "w": 6, "x": 0, - "y": 9 + "y": 7 }, "targets": [ { @@ -1753,10 +1753,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 4, + "h": 5, "w": 6, "x": 6, - "y": 9 + "y": 7 }, "targets": [ { @@ -1817,29 +1817,27 @@ data: "type": "timeseries", "title": "Platform Test Success Rate", "datasource": { - "type": "datasource", - "uid": "-- Dashboard --" + "type": "prometheus", + "uid": "atlas-vm" }, "gridPos": { - "h": 4, + "h": 5, "w": 6, "x": 12, - "y": 9 + "y": 7 }, "targets": [ { "refId": "A", - "datasource": { - "type": "datasource", - "uid": "-- Dashboard --" - }, - "dashboardUid": "atlas-jobs", - "panelId": 19 + "expr": "label_replace(100 * (sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (task) (increase(ariadne_task_runs_total[$__interval])), 1)), \"suite\", \"ariadne:${1}\", \"task\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_builds_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_builds_total[$__interval])), 1)), \"suite\", \"metis-build:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_flashes_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_flashes_total[$__interval])), 1)), \"suite\", \"metis-flash:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) / clamp_min(sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])), 1)), \"suite\", \"ananke-quality:${1}\", \"instance\", \"(.*)\")", + "legendFormat": "{{suite}}" } ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "min": 0, + "max": 100 }, "overrides": [] }, @@ -1862,7 +1860,7 @@ data: "targetBlank": true } ], - "description": "Sourced directly from Atlas Jobs internal panel #19 (atlas-jobs) so Overview reuses the exact same suite-level test stream." + "description": "Per-test rolling pass rate (0-100) across Ariadne tasks, Metis build/flash node runs, and Ananke quality-gate runs." }, { "id": 47, @@ -1873,10 +1871,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 4, + "h": 5, "w": 6, "x": 18, - "y": 9 + "y": 7 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-power.yaml b/services/monitoring/grafana-dashboard-power.yaml index 294bd75a..e2b9b172 100644 --- a/services/monitoring/grafana-dashboard-power.yaml +++ b/services/monitoring/grafana-dashboard-power.yaml @@ -463,13 +463,13 @@ data: { "refId": "B", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"inside_inlet\"}) or on() vector(0))", - "legendFormat": "Inside Inlet", + "legendFormat": "In Inlet", "instant": true }, { "refId": "C", "expr": "round(max(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"}) or max(atlas_climate_fan_activity_level{position=\"outside_inlet\"}) or on() vector(0))", - "legendFormat": "Outside Inlet", + "legendFormat": "Out Inlet", "instant": true }, { @@ -548,12 +548,12 @@ data: { "refId": "B", "expr": "(atlas_climate_fan_activity_level{fan_group=\"inside_inlet\"} or atlas_climate_fan_activity_level{position=\"inside_inlet\"})", - "legendFormat": "Inside Inlet" + "legendFormat": "In Inlet" }, { "refId": "C", "expr": "(atlas_climate_fan_activity_level{fan_group=\"outside_inlet\"} or atlas_climate_fan_activity_level{position=\"outside_inlet\"})", - "legendFormat": "Outside Inlet" + "legendFormat": "Out Inlet" }, { "refId": "D",