From 23146aaa8a6c5c221c92f530c35edc67294a8294 Mon Sep 17 00:00:00 2001
From: jenkins <jenkins@bstein.dev>
Date: Wed, 22 Apr 2026 14:34:40 -0300
Subject: [PATCH] monitoring(testing): clean canonical suite rows

---
 scripts/dashboards_render_atlas.py            |  73 ++++---
 .../monitoring/dashboards/atlas-jobs.json     | 191 ++++++++++--------
 .../monitoring/dashboards/atlas-testing.json  | 191 ++++++++++--------
 .../monitoring/grafana-dashboard-jobs.yaml    | 191 ++++++++++--------
 .../monitoring/grafana-dashboard-testing.yaml | 191 ++++++++++--------
 5 files changed, 462 insertions(+), 375 deletions(-)

diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index a65b48c5..ba8cd64b 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -529,7 +529,7 @@ PLATFORM_TEST_SUITE_MATCHER = "|".join(
     PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite) for suite in PLATFORM_TEST_SUITE_NAMES
 )
 PLATFORM_TEST_SUITE_CANONICAL_MATCHER = "|".join(PLATFORM_TEST_SUITE_NAMES)
-PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER = PLATFORM_TEST_SUITE_MATCHER
+PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER = PLATFORM_TEST_SUITE_CANONICAL_MATCHER
 PLATFORM_TEST_SUCCESS_EVENTS_30D = (
     f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[30d])) or on() vector(0))'
 )
@@ -1098,15 +1098,12 @@ def testing_suite_variable():
     options = [
         {
             "text": suite,
-            "value": PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite),
+            "value": suite,
             "selected": False,
         }
         for suite in PLATFORM_TEST_SUITE_NAMES
     ]
-    query = ",".join(
-        f"{suite} : {PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite)}"
-        for suite in PLATFORM_TEST_SUITE_NAMES
-    )
+    query = ",".join(f"{suite} : {suite}" for suite in PLATFORM_TEST_SUITE_NAMES)
     return {
         "name": "suite",
         "label": "Suite",
@@ -3111,13 +3108,24 @@ def build_jobs_dashboard():
     )
     success_rate_24h = f"100 * ({success_24h}) / clamp_min(({runs_24h}), 1)"
     success_rate_30d = f"100 * ({success_30d}) / clamp_min(({runs_30d}), 1)"
+    runs_by_suite_24h = f'sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[24h]))'
+    success_by_suite_24h = (
+        f'sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[24h]))'
+    )
     success_rate_by_suite_24h = (
-        f'sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[24h]))) '
-        f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[24h]))), 1))'
+        f'sort_desc(((100 * ({success_by_suite_24h}) / clamp_min(({runs_by_suite_24h}), 1)) '
+        f'and on(suite) (({runs_by_suite_24h}) > 0)) '
+        f'or on(suite) ((0 * ({runs_by_suite_24h})) - 1))'
     )
     failures_by_suite_24h = (
         f'sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_failure_selector}}}[24h]))'
     )
+    non_failure = PLATFORM_TEST_NON_FAILURE_STATUS
+    current_gate_health_by_suite = (
+        f'(100 * sum by (suite) (max by (suite, check) (({{{checks_selector},result=~"{non_failure}"}} > bool 0))) '
+        f'/ clamp_min(sum by (suite) (max by (suite, check) (({{{checks_selector}}} > bool 0))), 1)) '
+        f'or on(suite) ({selected_suite_zero})'
+    )
     success_history_by_suite = (
         f'100 * (sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[$__interval])) '
         f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[$__interval]))), 1))'
@@ -3137,7 +3145,6 @@ def build_jobs_dashboard():
     average_coverage = f"(avg(({coverage_by_suite})) or on() vector(0))"
     suites_loc_violating = f'(sum((({smell_by_suite}) > bool 0)) or on() vector(0))'
 
-    non_failure = PLATFORM_TEST_NON_FAILURE_STATUS
     checks_failed_total = f'(sum({{{checks_selector},result!~"{non_failure}"}}) or on() vector(0))'
     checks_failed_tests = (
         f'(sum(count by (suite) ({{{checks_selector},check=~"tests|unit|build",result!~"{non_failure}"}})) or on() vector(0))'
@@ -3284,7 +3291,7 @@ def build_jobs_dashboard():
     panels.append(
         stat_panel(
             2,
-            "Success Rate (24h)",
+            "Run Reliability (24h)",
             success_rate_24h,
             {"h": 5, "w": 4, "x": 0, "y": 0},
             unit="percent",
@@ -3296,7 +3303,7 @@ def build_jobs_dashboard():
     panels.append(
         stat_panel(
             3,
-            "Success Rate (30d)",
+            "Run Reliability (30d)",
             success_rate_30d,
             {"h": 5, "w": 4, "x": 4, "y": 0},
             unit="percent",
@@ -3308,7 +3315,7 @@ def build_jobs_dashboard():
     panels.append(
         stat_panel(
             4,
-            "Failures (24h)",
+            "Failed Runs (24h)",
             failures_24h,
             {"h": 5, "w": 4, "x": 8, "y": 0},
             unit="none",
@@ -3357,21 +3364,9 @@ def build_jobs_dashboard():
     panels.append(
         bargauge_panel(
             8,
-            "Failures by Suite (24h)",
-            failures_by_suite_24h,
+            "Current Gate Health by Suite",
+            current_gate_health_by_suite,
             {"h": 8, "w": 8, "x": 0, "y": 5},
-            unit="none",
-            instant=True,
-            legend="{{suite}}",
-            thresholds=failures_thresholds,
-        )
-    )
-    panels.append(
-        bargauge_panel(
-            9,
-            "Success Rate by Suite (24h)",
-            success_rate_by_suite_24h,
-            {"h": 8, "w": 8, "x": 8, "y": 5},
             unit="percent",
             instant=True,
             legend="{{suite}}",
@@ -3380,6 +3375,30 @@ def build_jobs_dashboard():
             decimals=2,
         )
     )
+    panels[-1]["description"] = (
+        "Current pass percentage across the required gate dimensions reported by each suite. "
+        "This is the fastest place to answer whether the latest suite quality signal is healthy."
+    )
+    reliability_suite_panel = bargauge_panel(
+        9,
+        "Run Reliability by Suite (24h)",
+        success_rate_by_suite_24h,
+        {"h": 8, "w": 8, "x": 8, "y": 5},
+        unit="percent",
+        instant=True,
+        legend="{{suite}}",
+        sort_order="asc",
+        thresholds=success_thresholds,
+        decimals=2,
+    )
+    reliability_suite_panel["description"] = (
+        "Rolling CI run success rate. This can stay low after failed/debug runs even when "
+        "Current Gate Health is green."
+    )
+    reliability_suite_panel["fieldConfig"]["defaults"]["mappings"] = [
+        {"type": "value", "options": {"-1": {"text": "no runs"}}}
+    ]
+    panels.append(reliability_suite_panel)
     coverage_gap_panel = bargauge_panel(
         10,
         "Coverage Gap to 95% by Suite",
@@ -3397,7 +3416,7 @@ def build_jobs_dashboard():
 
     history_panel = timeseries_panel(
         11,
-        "Success History by Suite",
+        "Run Reliability History by Suite",
         success_history_by_suite,
         {"h": 8, "w": 24, "x": 0, "y": 13},
         unit="percent",
diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json
index 1a38da09..aaa00cb3 100644
--- a/services/monitoring/dashboards/atlas-jobs.json
+++ b/services/monitoring/dashboards/atlas-jobs.json
@@ -7,7 +7,7 @@
     {
       "id": 2,
       "type": "stat",
-      "title": "Success Rate (24h)",
+      "title": "Run Reliability (24h)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -81,7 +81,7 @@
     {
       "id": 3,
       "type": "stat",
-      "title": "Success Rate (30d)",
+      "title": "Run Reliability (30d)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -155,7 +155,7 @@
     {
       "id": 4,
       "type": "stat",
-      "title": "Failures (24h)",
+      "title": "Failed Runs (24h)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -432,7 +432,7 @@
     {
       "id": 8,
       "type": "bargauge",
-      "title": "Failures by Suite (24h)",
+      "title": "Current Gate Health by Suite",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -445,81 +445,7 @@
       },
       "targets": [
         {
-          "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status!~\"ok|passed|success\"}[24h])))",
-          "refId": "A",
-          "legendFormat": "{{suite}}",
-          "instant": true
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "none",
-          "min": 0,
-          "max": null,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "yellow",
-                "value": 1
-              },
-              {
-                "color": "orange",
-                "value": 3
-              },
-              {
-                "color": "red",
-                "value": 5
-              }
-            ]
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "displayMode": "gradient",
-        "orientation": "horizontal",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        }
-      },
-      "transformations": [
-        {
-          "id": "sortBy",
-          "options": {
-            "fields": [
-              "Value"
-            ],
-            "order": "desc"
-          }
-        }
-      ]
-    },
-    {
-      "id": 9,
-      "type": "bargauge",
-      "title": "Success Rate by Suite (24h)",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 8,
-        "x": 8,
-        "y": 5
-      },
-      "targets": [
-        {
-          "expr": "sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1))",
+          "expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))",
           "refId": "A",
           "legendFormat": "{{suite}}",
           "instant": true
@@ -580,7 +506,98 @@
             "order": "asc"
           }
         }
-      ]
+      ],
+      "description": "Current pass percentage across the required gate dimensions reported by each suite. This is the fastest place to answer whether the latest suite quality signal is healthy."
+    },
+    {
+      "id": 9,
+      "type": "bargauge",
+      "title": "Run Reliability by Suite (24h)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 8,
+        "y": 5
+      },
+      "targets": [
+        {
+          "expr": "sort_desc(((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))) > 0)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h])))) - 1))",
+          "refId": "A",
+          "legendFormat": "{{suite}}",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "min": 0,
+          "max": 100,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "orange",
+                "value": 90
+              },
+              {
+                "color": "yellow",
+                "value": 93
+              },
+              {
+                "color": "green",
+                "value": 95
+              },
+              {
+                "color": "blue",
+                "value": 100
+              }
+            ]
+          },
+          "decimals": 2,
+          "mappings": [
+            {
+              "type": "value",
+              "options": {
+                "-1": {
+                  "text": "no runs"
+                }
+              }
+            }
+          ]
+        },
+        "overrides": []
+      },
+      "options": {
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      },
+      "transformations": [
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "asc"
+          }
+        }
+      ],
+      "description": "Rolling CI run success rate. This can stay low after failed/debug runs even when Current Gate Health is green."
     },
     {
       "id": 10,
@@ -661,7 +678,7 @@
     {
       "id": 11,
       "type": "timeseries",
-      "title": "Success History by Suite",
+      "title": "Run Reliability History by Suite",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -3380,7 +3397,7 @@
         "name": "suite",
         "label": "Suite",
         "type": "custom",
-        "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus|pegasus-health|pegasus_health,soteria : soteria,titan_iac : titan_iac|titan-iac,bstein_home : bstein_home|bstein-home,data_prepper : data_prepper|data-prepper",
+        "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper",
         "current": {
           "text": "All",
           "value": "$__all",
@@ -3409,7 +3426,7 @@
           },
           {
             "text": "pegasus",
-            "value": "pegasus|pegasus-health|pegasus_health",
+            "value": "pegasus",
             "selected": false
           },
           {
@@ -3419,24 +3436,24 @@
           },
           {
             "text": "titan_iac",
-            "value": "titan_iac|titan-iac",
+            "value": "titan_iac",
             "selected": false
           },
           {
             "text": "bstein_home",
-            "value": "bstein_home|bstein-home",
+            "value": "bstein_home",
             "selected": false
           },
           {
             "text": "data_prepper",
-            "value": "data_prepper|data-prepper",
+            "value": "data_prepper",
             "selected": false
           }
         ],
         "hide": 0,
         "multi": false,
         "includeAll": true,
-        "allValue": "ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper",
+        "allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper",
         "refresh": 1,
         "sort": 1,
         "skipUrlSync": false
diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json
index 8fb5b8ed..71ccae86 100644
--- a/services/monitoring/dashboards/atlas-testing.json
+++ b/services/monitoring/dashboards/atlas-testing.json
@@ -7,7 +7,7 @@
     {
       "id": 2,
       "type": "stat",
-      "title": "Success Rate (24h)",
+      "title": "Run Reliability (24h)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -81,7 +81,7 @@
     {
       "id": 3,
       "type": "stat",
-      "title": "Success Rate (30d)",
+      "title": "Run Reliability (30d)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -155,7 +155,7 @@
     {
       "id": 4,
       "type": "stat",
-      "title": "Failures (24h)",
+      "title": "Failed Runs (24h)",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -432,7 +432,7 @@
     {
       "id": 8,
       "type": "bargauge",
-      "title": "Failures by Suite (24h)",
+      "title": "Current Gate Health by Suite",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -445,81 +445,7 @@
       },
       "targets": [
         {
-          "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status!~\"ok|passed|success\"}[24h])))",
-          "refId": "A",
-          "legendFormat": "{{suite}}",
-          "instant": true
-        }
-      ],
-      "fieldConfig": {
-        "defaults": {
-          "unit": "none",
-          "min": 0,
-          "max": null,
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "yellow",
-                "value": 1
-              },
-              {
-                "color": "orange",
-                "value": 3
-              },
-              {
-                "color": "red",
-                "value": 5
-              }
-            ]
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "displayMode": "gradient",
-        "orientation": "horizontal",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        }
-      },
-      "transformations": [
-        {
-          "id": "sortBy",
-          "options": {
-            "fields": [
-              "Value"
-            ],
-            "order": "desc"
-          }
-        }
-      ]
-    },
-    {
-      "id": 9,
-      "type": "bargauge",
-      "title": "Success Rate by Suite (24h)",
-      "datasource": {
-        "type": "prometheus",
-        "uid": "atlas-vm"
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 8,
-        "x": 8,
-        "y": 5
-      },
-      "targets": [
-        {
-          "expr": "sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1))",
+          "expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))",
           "refId": "A",
           "legendFormat": "{{suite}}",
           "instant": true
@@ -580,7 +506,98 @@
             "order": "asc"
           }
         }
-      ]
+      ],
+      "description": "Current pass percentage across the required gate dimensions reported by each suite. This is the fastest place to answer whether the latest suite quality signal is healthy."
+    },
+    {
+      "id": 9,
+      "type": "bargauge",
+      "title": "Run Reliability by Suite (24h)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 8,
+        "y": 5
+      },
+      "targets": [
+        {
+          "expr": "sort_desc(((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))) > 0)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h])))) - 1))",
+          "refId": "A",
+          "legendFormat": "{{suite}}",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "min": 0,
+          "max": 100,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "orange",
+                "value": 90
+              },
+              {
+                "color": "yellow",
+                "value": 93
+              },
+              {
+                "color": "green",
+                "value": 95
+              },
+              {
+                "color": "blue",
+                "value": 100
+              }
+            ]
+          },
+          "decimals": 2,
+          "mappings": [
+            {
+              "type": "value",
+              "options": {
+                "-1": {
+                  "text": "no runs"
+                }
+              }
+            }
+          ]
+        },
+        "overrides": []
+      },
+      "options": {
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      },
+      "transformations": [
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "asc"
+          }
+        }
+      ],
+      "description": "Rolling CI run success rate. This can stay low after failed/debug runs even when Current Gate Health is green."
     },
     {
       "id": 10,
@@ -661,7 +678,7 @@
     {
       "id": 11,
       "type": "timeseries",
-      "title": "Success History by Suite",
+      "title": "Run Reliability History by Suite",
       "datasource": {
         "type": "prometheus",
         "uid": "atlas-vm"
@@ -3380,7 +3397,7 @@
         "name": "suite",
         "label": "Suite",
         "type": "custom",
-        "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus|pegasus-health|pegasus_health,soteria : soteria,titan_iac : titan_iac|titan-iac,bstein_home : bstein_home|bstein-home,data_prepper : data_prepper|data-prepper",
+        "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper",
         "current": {
           "text": "All",
           "value": "$__all",
@@ -3409,7 +3426,7 @@
           },
           {
             "text": "pegasus",
-            "value": "pegasus|pegasus-health|pegasus_health",
+            "value": "pegasus",
             "selected": false
           },
           {
@@ -3419,24 +3436,24 @@
           },
           {
             "text": "titan_iac",
-            "value": "titan_iac|titan-iac",
+            "value": "titan_iac",
             "selected": false
           },
           {
             "text": "bstein_home",
-            "value": "bstein_home|bstein-home",
+            "value": "bstein_home",
             "selected": false
           },
           {
             "text": "data_prepper",
-            "value": "data_prepper|data-prepper",
+            "value": "data_prepper",
             "selected": false
           }
         ],
         "hide": 0,
         "multi": false,
         "includeAll": true,
-        "allValue": "ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper",
+        "allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper",
         "refresh": 1,
         "sort": 1,
         "skipUrlSync": false
diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml
index 135d992f..929b4744 100644
--- a/services/monitoring/grafana-dashboard-jobs.yaml
+++ b/services/monitoring/grafana-dashboard-jobs.yaml
@@ -16,7 +16,7 @@ data:
         {
           "id": 2,
           "type": "stat",
-          "title": "Success Rate (24h)",
+          "title": "Run Reliability (24h)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -90,7 +90,7 @@ data:
         {
           "id": 3,
           "type": "stat",
-          "title": "Success Rate (30d)",
+          "title": "Run Reliability (30d)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -164,7 +164,7 @@ data:
         {
           "id": 4,
           "type": "stat",
-          "title": "Failures (24h)",
+          "title": "Failed Runs (24h)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -441,7 +441,7 @@ data:
         {
           "id": 8,
           "type": "bargauge",
-          "title": "Failures by Suite (24h)",
+          "title": "Current Gate Health by Suite",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -454,81 +454,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status!~\"ok|passed|success\"}[24h])))",
-              "refId": "A",
-              "legendFormat": "{{suite}}",
-              "instant": true
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "none",
-              "min": 0,
-              "max": null,
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "green",
-                    "value": null
-                  },
-                  {
-                    "color": "yellow",
-                    "value": 1
-                  },
-                  {
-                    "color": "orange",
-                    "value": 3
-                  },
-                  {
-                    "color": "red",
-                    "value": 5
-                  }
-                ]
-              }
-            },
-            "overrides": []
-          },
-          "options": {
-            "displayMode": "gradient",
-            "orientation": "horizontal",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            }
-          },
-          "transformations": [
-            {
-              "id": "sortBy",
-              "options": {
-                "fields": [
-                  "Value"
-                ],
-                "order": "desc"
-              }
-            }
-          ]
-        },
-        {
-          "id": 9,
-          "type": "bargauge",
-          "title": "Success Rate by Suite (24h)",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 8,
-            "w": 8,
-            "x": 8,
-            "y": 5
-          },
-          "targets": [
-            {
-              "expr": "sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1))",
+              "expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))",
               "refId": "A",
               "legendFormat": "{{suite}}",
               "instant": true
@@ -589,7 +515,98 @@ data:
                 "order": "asc"
               }
             }
-          ]
+          ],
+          "description": "Current pass percentage across the required gate dimensions reported by each suite. This is the fastest place to answer whether the latest suite quality signal is healthy."
+        },
+        {
+          "id": 9,
+          "type": "bargauge",
+          "title": "Run Reliability by Suite (24h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 8,
+            "x": 8,
+            "y": 5
+          },
+          "targets": [
+            {
+              "expr": "sort_desc(((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))) > 0)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h])))) - 1))",
+              "refId": "A",
+              "legendFormat": "{{suite}}",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent",
+              "min": 0,
+              "max": 100,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "red",
+                    "value": null
+                  },
+                  {
+                    "color": "orange",
+                    "value": 90
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 93
+                  },
+                  {
+                    "color": "green",
+                    "value": 95
+                  },
+                  {
+                    "color": "blue",
+                    "value": 100
+                  }
+                ]
+              },
+              "decimals": 2,
+              "mappings": [
+                {
+                  "type": "value",
+                  "options": {
+                    "-1": {
+                      "text": "no runs"
+                    }
+                  }
+                }
+              ]
+            },
+            "overrides": []
+          },
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "transformations": [
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "asc"
+              }
+            }
+          ],
+          "description": "Rolling CI run success rate. This can stay low after failed/debug runs even when Current Gate Health is green."
         },
         {
           "id": 10,
@@ -670,7 +687,7 @@ data:
         {
           "id": 11,
           "type": "timeseries",
-          "title": "Success History by Suite",
+          "title": "Run Reliability History by Suite",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -3389,7 +3406,7 @@ data:
             "name": "suite",
             "label": "Suite",
             "type": "custom",
-            "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus|pegasus-health|pegasus_health,soteria : soteria,titan_iac : titan_iac|titan-iac,bstein_home : bstein_home|bstein-home,data_prepper : data_prepper|data-prepper",
+            "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper",
             "current": {
               "text": "All",
               "value": "$__all",
@@ -3418,7 +3435,7 @@ data:
               },
               {
                 "text": "pegasus",
-                "value": "pegasus|pegasus-health|pegasus_health",
+                "value": "pegasus",
                 "selected": false
               },
               {
@@ -3428,24 +3445,24 @@ data:
               },
               {
                 "text": "titan_iac",
-                "value": "titan_iac|titan-iac",
+                "value": "titan_iac",
                 "selected": false
               },
               {
                 "text": "bstein_home",
-                "value": "bstein_home|bstein-home",
+                "value": "bstein_home",
                 "selected": false
               },
               {
                 "text": "data_prepper",
-                "value": "data_prepper|data-prepper",
+                "value": "data_prepper",
                 "selected": false
               }
             ],
             "hide": 0,
             "multi": false,
             "includeAll": true,
-            "allValue": "ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper",
+            "allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper",
             "refresh": 1,
             "sort": 1,
             "skipUrlSync": false
diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml
index d6371f1f..401901da 100644
--- a/services/monitoring/grafana-dashboard-testing.yaml
+++ b/services/monitoring/grafana-dashboard-testing.yaml
@@ -16,7 +16,7 @@ data:
         {
           "id": 2,
           "type": "stat",
-          "title": "Success Rate (24h)",
+          "title": "Run Reliability (24h)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -90,7 +90,7 @@ data:
         {
           "id": 3,
           "type": "stat",
-          "title": "Success Rate (30d)",
+          "title": "Run Reliability (30d)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -164,7 +164,7 @@ data:
         {
           "id": 4,
           "type": "stat",
-          "title": "Failures (24h)",
+          "title": "Failed Runs (24h)",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -441,7 +441,7 @@ data:
         {
           "id": 8,
           "type": "bargauge",
-          "title": "Failures by Suite (24h)",
+          "title": "Current Gate Health by Suite",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -454,81 +454,7 @@ data:
           },
           "targets": [
             {
-              "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status!~\"ok|passed|success\"}[24h])))",
-              "refId": "A",
-              "legendFormat": "{{suite}}",
-              "instant": true
-            }
-          ],
-          "fieldConfig": {
-            "defaults": {
-              "unit": "none",
-              "min": 0,
-              "max": null,
-              "thresholds": {
-                "mode": "absolute",
-                "steps": [
-                  {
-                    "color": "green",
-                    "value": null
-                  },
-                  {
-                    "color": "yellow",
-                    "value": 1
-                  },
-                  {
-                    "color": "orange",
-                    "value": 3
-                  },
-                  {
-                    "color": "red",
-                    "value": 5
-                  }
-                ]
-              }
-            },
-            "overrides": []
-          },
-          "options": {
-            "displayMode": "gradient",
-            "orientation": "horizontal",
-            "reduceOptions": {
-              "calcs": [
-                "lastNotNull"
-              ],
-              "fields": "",
-              "values": false
-            }
-          },
-          "transformations": [
-            {
-              "id": "sortBy",
-              "options": {
-                "fields": [
-                  "Value"
-                ],
-                "order": "desc"
-              }
-            }
-          ]
-        },
-        {
-          "id": 9,
-          "type": "bargauge",
-          "title": "Success Rate by Suite (24h)",
-          "datasource": {
-            "type": "prometheus",
-            "uid": "atlas-vm"
-          },
-          "gridPos": {
-            "h": 8,
-            "w": 8,
-            "x": 8,
-            "y": 5
-          },
-          "targets": [
-            {
-              "expr": "sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1))",
+              "expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))",
               "refId": "A",
               "legendFormat": "{{suite}}",
               "instant": true
@@ -589,7 +515,98 @@ data:
                 "order": "asc"
               }
             }
-          ]
+          ],
+          "description": "Current pass percentage across the required gate dimensions reported by each suite. This is the fastest place to answer whether the latest suite quality signal is healthy."
+        },
+        {
+          "id": 9,
+          "type": "bargauge",
+          "title": "Run Reliability by Suite (24h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 8,
+            "x": 8,
+            "y": 5
+          },
+          "targets": [
+            {
+              "expr": "sort_desc(((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))) > 0)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h])))) - 1))",
+              "refId": "A",
+              "legendFormat": "{{suite}}",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent",
+              "min": 0,
+              "max": 100,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "red",
+                    "value": null
+                  },
+                  {
+                    "color": "orange",
+                    "value": 90
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 93
+                  },
+                  {
+                    "color": "green",
+                    "value": 95
+                  },
+                  {
+                    "color": "blue",
+                    "value": 100
+                  }
+                ]
+              },
+              "decimals": 2,
+              "mappings": [
+                {
+                  "type": "value",
+                  "options": {
+                    "-1": {
+                      "text": "no runs"
+                    }
+                  }
+                }
+              ]
+            },
+            "overrides": []
+          },
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "transformations": [
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "asc"
+              }
+            }
+          ],
+          "description": "Rolling CI run success rate. This can stay low after failed/debug runs even when Current Gate Health is green."
         },
         {
           "id": 10,
@@ -670,7 +687,7 @@ data:
         {
           "id": 11,
           "type": "timeseries",
-          "title": "Success History by Suite",
+          "title": "Run Reliability History by Suite",
           "datasource": {
             "type": "prometheus",
             "uid": "atlas-vm"
@@ -3389,7 +3406,7 @@ data:
             "name": "suite",
             "label": "Suite",
             "type": "custom",
-            "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus|pegasus-health|pegasus_health,soteria : soteria,titan_iac : titan_iac|titan-iac,bstein_home : bstein_home|bstein-home,data_prepper : data_prepper|data-prepper",
+            "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper",
             "current": {
               "text": "All",
               "value": "$__all",
@@ -3418,7 +3435,7 @@ data:
               },
               {
                 "text": "pegasus",
-                "value": "pegasus|pegasus-health|pegasus_health",
+                "value": "pegasus",
                 "selected": false
               },
               {
@@ -3428,24 +3445,24 @@ data:
               },
               {
                 "text": "titan_iac",
-                "value": "titan_iac|titan-iac",
+                "value": "titan_iac",
                 "selected": false
               },
               {
                 "text": "bstein_home",
-                "value": "bstein_home|bstein-home",
+                "value": "bstein_home",
                 "selected": false
               },
               {
                 "text": "data_prepper",
-                "value": "data_prepper|data-prepper",
+                "value": "data_prepper",
                 "selected": false
               }
             ],
             "hide": 0,
             "multi": false,
             "includeAll": true,
-            "allValue": "ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper",
+            "allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper",
             "refresh": 1,
             "sort": 1,
             "skipUrlSync": false