monitoring(testing): clean canonical suite rows

2026-04-22 14:34:40 -03:00 · 2026-04-22 14:34:40 -03:00 · 23146aaa8a
commit 23146aaa8a
parent cc757ba082
5 changed files with 462 additions and 375 deletions
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@ -529,7 +529,7 @@ PLATFORM_TEST_SUITE_MATCHER = "|".join(
    PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite) for suite in PLATFORM_TEST_SUITE_NAMES
 )
 PLATFORM_TEST_SUITE_CANONICAL_MATCHER = "|".join(PLATFORM_TEST_SUITE_NAMES)
-PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER = PLATFORM_TEST_SUITE_MATCHER
+PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER = PLATFORM_TEST_SUITE_CANONICAL_MATCHER
 PLATFORM_TEST_SUCCESS_EVENTS_30D = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[30d])) or on() vector(0))'
 )
@ -1098,15 +1098,12 @@ def testing_suite_variable():
    options = [
        {
            "text": suite,
-            "value": PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite),
+            "value": suite,
            "selected": False,
        }
        for suite in PLATFORM_TEST_SUITE_NAMES
    ]
-    query = ",".join(
+    query = ",".join(f"{suite} : {suite}" for suite in PLATFORM_TEST_SUITE_NAMES)
-        f"{suite} : {PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite)}"
-        for suite in PLATFORM_TEST_SUITE_NAMES
-    )
    return {
        "name": "suite",
        "label": "Suite",
@ -3111,13 +3108,24 @@ def build_jobs_dashboard():
    )
    success_rate_24h = f"100 * ({success_24h}) / clamp_min(({runs_24h}), 1)"
    success_rate_30d = f"100 * ({success_30d}) / clamp_min(({runs_30d}), 1)"
    runs_by_suite_24h = f'sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[24h]))'
    success_by_suite_24h = (
        f'sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[24h]))'
    )
    success_rate_by_suite_24h = (
-        f'sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[24h]))) '
+        f'sort_desc(((100 * ({success_by_suite_24h}) / clamp_min(({runs_by_suite_24h}), 1)) '
-        f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[24h]))), 1))'
+        f'and on(suite) (({runs_by_suite_24h}) > 0)) '
        f'or on(suite) ((0 * ({runs_by_suite_24h})) - 1))'
    )
    failures_by_suite_24h = (
        f'sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_failure_selector}}}[24h]))'
    )
    non_failure = PLATFORM_TEST_NON_FAILURE_STATUS
    current_gate_health_by_suite = (
        f'(100 * sum by (suite) (max by (suite, check) (({{{checks_selector},result=~"{non_failure}"}} > bool 0))) '
        f'/ clamp_min(sum by (suite) (max by (suite, check) (({{{checks_selector}}} > bool 0))), 1)) '
        f'or on(suite) ({selected_suite_zero})'
    )
    success_history_by_suite = (
        f'100 * (sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[$__interval])) '
        f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[$__interval]))), 1))'
@ -3137,7 +3145,6 @@ def build_jobs_dashboard():
    average_coverage = f"(avg(({coverage_by_suite})) or on() vector(0))"
    suites_loc_violating = f'(sum((({smell_by_suite}) > bool 0)) or on() vector(0))'
    non_failure = PLATFORM_TEST_NON_FAILURE_STATUS
    checks_failed_total = f'(sum({{{checks_selector},result!~"{non_failure}"}}) or on() vector(0))'
    checks_failed_tests = (
        f'(sum(count by (suite) ({{{checks_selector},check=~"tests|unit|build",result!~"{non_failure}"}})) or on() vector(0))'
@ -3284,7 +3291,7 @@ def build_jobs_dashboard():
    panels.append(
        stat_panel(
            2,
-            "Success Rate (24h)",
+            "Run Reliability (24h)",
            success_rate_24h,
            {"h": 5, "w": 4, "x": 0, "y": 0},
            unit="percent",
@ -3296,7 +3303,7 @@ def build_jobs_dashboard():
    panels.append(
        stat_panel(
            3,
-            "Success Rate (30d)",
+            "Run Reliability (30d)",
            success_rate_30d,
            {"h": 5, "w": 4, "x": 4, "y": 0},
            unit="percent",
@ -3308,7 +3315,7 @@ def build_jobs_dashboard():
    panels.append(
        stat_panel(
            4,
-            "Failures (24h)",
+            "Failed Runs (24h)",
            failures_24h,
            {"h": 5, "w": 4, "x": 8, "y": 0},
            unit="none",
@ -3357,19 +3364,24 @@ def build_jobs_dashboard():
    panels.append(
        bargauge_panel(
            8,
-            "Failures by Suite (24h)",
+            "Current Gate Health by Suite",
-            failures_by_suite_24h,
+            current_gate_health_by_suite,
            {"h": 8, "w": 8, "x": 0, "y": 5},
-            unit="none",
+            unit="percent",
            instant=True,
            legend="{{suite}}",
-            thresholds=failures_thresholds,
+            sort_order="asc",
            thresholds=success_thresholds,
            decimals=2,
        )
    )
-    panels.append(
+    panels[-1]["description"] = (
-        bargauge_panel(
+        "Current pass percentage across the required gate dimensions reported by each suite. "
        "This is the fastest place to answer whether the latest suite quality signal is healthy."
    )
    reliability_suite_panel = bargauge_panel(
        9,
-            "Success Rate by Suite (24h)",
+        "Run Reliability by Suite (24h)",
        success_rate_by_suite_24h,
        {"h": 8, "w": 8, "x": 8, "y": 5},
        unit="percent",
@ -3379,7 +3391,14 @@ def build_jobs_dashboard():
        thresholds=success_thresholds,
        decimals=2,
    )
    reliability_suite_panel["description"] = (
        "Rolling CI run success rate. This can stay low after failed/debug runs even when "
        "Current Gate Health is green."
    )
    reliability_suite_panel["fieldConfig"]["defaults"]["mappings"] = [
        {"type": "value", "options": {"-1": {"text": "no runs"}}}
    ]
    panels.append(reliability_suite_panel)
    coverage_gap_panel = bargauge_panel(
        10,
        "Coverage Gap to 95% by Suite",
@ -3397,7 +3416,7 @@ def build_jobs_dashboard():
    history_panel = timeseries_panel(
        11,
-        "Success History by Suite",
+        "Run Reliability History by Suite",
        success_history_by_suite,
        {"h": 8, "w": 24, "x": 0, "y": 13},
        unit="percent",
--- a/services/monitoring/dashboards/atlas-jobs.json
+++ b/services/monitoring/dashboards/atlas-jobs.json
@ -7,7 +7,7 @@
    {
      "id": 2,
      "type": "stat",
-      "title": "Success Rate (24h)",
+      "title": "Run Reliability (24h)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -81,7 +81,7 @@
    {
      "id": 3,
      "type": "stat",
-      "title": "Success Rate (30d)",
+      "title": "Run Reliability (30d)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -155,7 +155,7 @@
    {
      "id": 4,
      "type": "stat",
-      "title": "Failures (24h)",
+      "title": "Failed Runs (24h)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -432,7 +432,7 @@
    {
      "id": 8,
      "type": "bargauge",
-      "title": "Failures by Suite (24h)",
+      "title": "Current Gate Health by Suite",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -445,81 +445,7 @@
      },
      "targets": [
        {
-          "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status!~\"ok|passed|success\"}[24h])))",
+          "expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))",
          "refId": "A",
          "legendFormat": "{{suite}}",
          "instant": true
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "none",
          "min": 0,
          "max": null,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "yellow",
                "value": 1
              },
              {
                "color": "orange",
                "value": 3
              },
              {
                "color": "red",
                "value": 5
              }
            ]
          }
        },
        "overrides": []
      },
      "options": {
        "displayMode": "gradient",
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        }
      },
      "transformations": [
        {
          "id": "sortBy",
          "options": {
            "fields": [
              "Value"
            ],
            "order": "desc"
          }
        }
      ]
    },
    {
      "id": 9,
      "type": "bargauge",
      "title": "Success Rate by Suite (24h)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 8,
        "w": 8,
        "x": 8,
        "y": 5
      },
      "targets": [
        {
          "expr": "sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1))",
          "refId": "A",
          "legendFormat": "{{suite}}",
          "instant": true
@ -580,8 +506,99 @@
            "order": "asc"
          }
        }
      ],
      "description": "Current pass percentage across the required gate dimensions reported by each suite. This is the fastest place to answer whether the latest suite quality signal is healthy."
    },
    {
      "id": 9,
      "type": "bargauge",
      "title": "Run Reliability by Suite (24h)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 8,
        "w": 8,
        "x": 8,
        "y": 5
      },
      "targets": [
        {
          "expr": "sort_desc(((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))) > 0)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h])))) - 1))",
          "refId": "A",
          "legendFormat": "{{suite}}",
          "instant": true
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "min": 0,
          "max": 100,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "red",
                "value": null
              },
              {
                "color": "orange",
                "value": 90
              },
              {
                "color": "yellow",
                "value": 93
              },
              {
                "color": "green",
                "value": 95
              },
              {
                "color": "blue",
                "value": 100
              }
            ]
          },
          "decimals": 2,
          "mappings": [
            {
              "type": "value",
              "options": {
                "-1": {
                  "text": "no runs"
                }
              }
            }
          ]
        },
        "overrides": []
      },
      "options": {
        "displayMode": "gradient",
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        }
      },
      "transformations": [
        {
          "id": "sortBy",
          "options": {
            "fields": [
              "Value"
            ],
            "order": "asc"
          }
        }
      ],
      "description": "Rolling CI run success rate. This can stay low after failed/debug runs even when Current Gate Health is green."
    },
    {
      "id": 10,
      "type": "bargauge",
@ -661,7 +678,7 @@
    {
      "id": 11,
      "type": "timeseries",
-      "title": "Success History by Suite",
+      "title": "Run Reliability History by Suite",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -3380,7 +3397,7 @@
        "name": "suite",
        "label": "Suite",
        "type": "custom",
-        "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus|pegasus-health|pegasus_health,soteria : soteria,titan_iac : titan_iac|titan-iac,bstein_home : bstein_home|bstein-home,data_prepper : data_prepper|data-prepper",
+        "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper",
        "current": {
          "text": "All",
          "value": "$__all",
@ -3409,7 +3426,7 @@
          },
          {
            "text": "pegasus",
-            "value": "pegasus|pegasus-health|pegasus_health",
+            "value": "pegasus",
            "selected": false
          },
          {
@ -3419,24 +3436,24 @@
          },
          {
            "text": "titan_iac",
-            "value": "titan_iac|titan-iac",
+            "value": "titan_iac",
            "selected": false
          },
          {
            "text": "bstein_home",
-            "value": "bstein_home|bstein-home",
+            "value": "bstein_home",
            "selected": false
          },
          {
            "text": "data_prepper",
-            "value": "data_prepper|data-prepper",
+            "value": "data_prepper",
            "selected": false
          }
        ],
        "hide": 0,
        "multi": false,
        "includeAll": true,
-        "allValue": "ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper",
+        "allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper",
        "refresh": 1,
        "sort": 1,
        "skipUrlSync": false
--- a/services/monitoring/dashboards/atlas-testing.json
+++ b/services/monitoring/dashboards/atlas-testing.json
@ -7,7 +7,7 @@
    {
      "id": 2,
      "type": "stat",
-      "title": "Success Rate (24h)",
+      "title": "Run Reliability (24h)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -81,7 +81,7 @@
    {
      "id": 3,
      "type": "stat",
-      "title": "Success Rate (30d)",
+      "title": "Run Reliability (30d)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -155,7 +155,7 @@
    {
      "id": 4,
      "type": "stat",
-      "title": "Failures (24h)",
+      "title": "Failed Runs (24h)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -432,7 +432,7 @@
    {
      "id": 8,
      "type": "bargauge",
-      "title": "Failures by Suite (24h)",
+      "title": "Current Gate Health by Suite",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -445,81 +445,7 @@
      },
      "targets": [
        {
-          "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status!~\"ok|passed|success\"}[24h])))",
+          "expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))",
          "refId": "A",
          "legendFormat": "{{suite}}",
          "instant": true
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "none",
          "min": 0,
          "max": null,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "yellow",
                "value": 1
              },
              {
                "color": "orange",
                "value": 3
              },
              {
                "color": "red",
                "value": 5
              }
            ]
          }
        },
        "overrides": []
      },
      "options": {
        "displayMode": "gradient",
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        }
      },
      "transformations": [
        {
          "id": "sortBy",
          "options": {
            "fields": [
              "Value"
            ],
            "order": "desc"
          }
        }
      ]
    },
    {
      "id": 9,
      "type": "bargauge",
      "title": "Success Rate by Suite (24h)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 8,
        "w": 8,
        "x": 8,
        "y": 5
      },
      "targets": [
        {
          "expr": "sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1))",
          "refId": "A",
          "legendFormat": "{{suite}}",
          "instant": true
@ -580,8 +506,99 @@
            "order": "asc"
          }
        }
      ],
      "description": "Current pass percentage across the required gate dimensions reported by each suite. This is the fastest place to answer whether the latest suite quality signal is healthy."
    },
    {
      "id": 9,
      "type": "bargauge",
      "title": "Run Reliability by Suite (24h)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
      },
      "gridPos": {
        "h": 8,
        "w": 8,
        "x": 8,
        "y": 5
      },
      "targets": [
        {
          "expr": "sort_desc(((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))) > 0)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h])))) - 1))",
          "refId": "A",
          "legendFormat": "{{suite}}",
          "instant": true
        }
      ],
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "min": 0,
          "max": 100,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "red",
                "value": null
              },
              {
                "color": "orange",
                "value": 90
              },
              {
                "color": "yellow",
                "value": 93
              },
              {
                "color": "green",
                "value": 95
              },
              {
                "color": "blue",
                "value": 100
              }
            ]
          },
          "decimals": 2,
          "mappings": [
            {
              "type": "value",
              "options": {
                "-1": {
                  "text": "no runs"
                }
              }
            }
          ]
        },
        "overrides": []
      },
      "options": {
        "displayMode": "gradient",
        "orientation": "horizontal",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        }
      },
      "transformations": [
        {
          "id": "sortBy",
          "options": {
            "fields": [
              "Value"
            ],
            "order": "asc"
          }
        }
      ],
      "description": "Rolling CI run success rate. This can stay low after failed/debug runs even when Current Gate Health is green."
    },
    {
      "id": 10,
      "type": "bargauge",
@ -661,7 +678,7 @@
    {
      "id": 11,
      "type": "timeseries",
-      "title": "Success History by Suite",
+      "title": "Run Reliability History by Suite",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -3380,7 +3397,7 @@
        "name": "suite",
        "label": "Suite",
        "type": "custom",
-        "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus|pegasus-health|pegasus_health,soteria : soteria,titan_iac : titan_iac|titan-iac,bstein_home : bstein_home|bstein-home,data_prepper : data_prepper|data-prepper",
+        "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper",
        "current": {
          "text": "All",
          "value": "$__all",
@ -3409,7 +3426,7 @@
          },
          {
            "text": "pegasus",
-            "value": "pegasus|pegasus-health|pegasus_health",
+            "value": "pegasus",
            "selected": false
          },
          {
@ -3419,24 +3436,24 @@
          },
          {
            "text": "titan_iac",
-            "value": "titan_iac|titan-iac",
+            "value": "titan_iac",
            "selected": false
          },
          {
            "text": "bstein_home",
-            "value": "bstein_home|bstein-home",
+            "value": "bstein_home",
            "selected": false
          },
          {
            "text": "data_prepper",
-            "value": "data_prepper|data-prepper",
+            "value": "data_prepper",
            "selected": false
          }
        ],
        "hide": 0,
        "multi": false,
        "includeAll": true,
-        "allValue": "ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper",
+        "allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper",
        "refresh": 1,
        "sort": 1,
        "skipUrlSync": false
--- a/services/monitoring/grafana-dashboard-jobs.yaml
+++ b/services/monitoring/grafana-dashboard-jobs.yaml
@ -16,7 +16,7 @@ data:
        {
          "id": 2,
          "type": "stat",
-          "title": "Success Rate (24h)",
+          "title": "Run Reliability (24h)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -90,7 +90,7 @@ data:
        {
          "id": 3,
          "type": "stat",
-          "title": "Success Rate (30d)",
+          "title": "Run Reliability (30d)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -164,7 +164,7 @@ data:
        {
          "id": 4,
          "type": "stat",
-          "title": "Failures (24h)",
+          "title": "Failed Runs (24h)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -441,7 +441,7 @@ data:
        {
          "id": 8,
          "type": "bargauge",
-          "title": "Failures by Suite (24h)",
+          "title": "Current Gate Health by Suite",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -454,81 +454,7 @@ data:
          },
          "targets": [
            {
-              "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status!~\"ok|passed|success\"}[24h])))",
+              "expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))",
              "refId": "A",
              "legendFormat": "{{suite}}",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "none",
              "min": 0,
              "max": null,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "yellow",
                    "value": 1
                  },
                  {
                    "color": "orange",
                    "value": 3
                  },
                  {
                    "color": "red",
                    "value": 5
                  }
                ]
              }
            },
            "overrides": []
          },
          "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": {
              "calcs": [
                "lastNotNull"
              ],
              "fields": "",
              "values": false
            }
          },
          "transformations": [
            {
              "id": "sortBy",
              "options": {
                "fields": [
                  "Value"
                ],
                "order": "desc"
              }
            }
          ]
        },
        {
          "id": 9,
          "type": "bargauge",
          "title": "Success Rate by Suite (24h)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 8,
            "x": 8,
            "y": 5
          },
          "targets": [
            {
              "expr": "sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1))",
              "refId": "A",
              "legendFormat": "{{suite}}",
              "instant": true
@ -589,8 +515,99 @@ data:
                "order": "asc"
              }
            }
          ],
          "description": "Current pass percentage across the required gate dimensions reported by each suite. This is the fastest place to answer whether the latest suite quality signal is healthy."
        },
        {
          "id": 9,
          "type": "bargauge",
          "title": "Run Reliability by Suite (24h)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 8,
            "x": 8,
            "y": 5
          },
          "targets": [
            {
              "expr": "sort_desc(((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))) > 0)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h])))) - 1))",
              "refId": "A",
              "legendFormat": "{{suite}}",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent",
              "min": 0,
              "max": 100,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "red",
                    "value": null
                  },
                  {
                    "color": "orange",
                    "value": 90
                  },
                  {
                    "color": "yellow",
                    "value": 93
                  },
                  {
                    "color": "green",
                    "value": 95
                  },
                  {
                    "color": "blue",
                    "value": 100
                  }
                ]
              },
              "decimals": 2,
              "mappings": [
                {
                  "type": "value",
                  "options": {
                    "-1": {
                      "text": "no runs"
                    }
                  }
                }
              ]
            },
            "overrides": []
          },
          "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": {
              "calcs": [
                "lastNotNull"
              ],
              "fields": "",
              "values": false
            }
          },
          "transformations": [
            {
              "id": "sortBy",
              "options": {
                "fields": [
                  "Value"
                ],
                "order": "asc"
              }
            }
          ],
          "description": "Rolling CI run success rate. This can stay low after failed/debug runs even when Current Gate Health is green."
        },
        {
          "id": 10,
          "type": "bargauge",
@ -670,7 +687,7 @@ data:
        {
          "id": 11,
          "type": "timeseries",
-          "title": "Success History by Suite",
+          "title": "Run Reliability History by Suite",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -3389,7 +3406,7 @@ data:
            "name": "suite",
            "label": "Suite",
            "type": "custom",
-            "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus|pegasus-health|pegasus_health,soteria : soteria,titan_iac : titan_iac|titan-iac,bstein_home : bstein_home|bstein-home,data_prepper : data_prepper|data-prepper",
+            "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper",
            "current": {
              "text": "All",
              "value": "$__all",
@ -3418,7 +3435,7 @@ data:
              },
              {
                "text": "pegasus",
-                "value": "pegasus|pegasus-health|pegasus_health",
+                "value": "pegasus",
                "selected": false
              },
              {
@ -3428,24 +3445,24 @@ data:
              },
              {
                "text": "titan_iac",
-                "value": "titan_iac|titan-iac",
+                "value": "titan_iac",
                "selected": false
              },
              {
                "text": "bstein_home",
-                "value": "bstein_home|bstein-home",
+                "value": "bstein_home",
                "selected": false
              },
              {
                "text": "data_prepper",
-                "value": "data_prepper|data-prepper",
+                "value": "data_prepper",
                "selected": false
              }
            ],
            "hide": 0,
            "multi": false,
            "includeAll": true,
-            "allValue": "ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper",
+            "allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper",
            "refresh": 1,
            "sort": 1,
            "skipUrlSync": false
--- a/services/monitoring/grafana-dashboard-testing.yaml
+++ b/services/monitoring/grafana-dashboard-testing.yaml
@ -16,7 +16,7 @@ data:
        {
          "id": 2,
          "type": "stat",
-          "title": "Success Rate (24h)",
+          "title": "Run Reliability (24h)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -90,7 +90,7 @@ data:
        {
          "id": 3,
          "type": "stat",
-          "title": "Success Rate (30d)",
+          "title": "Run Reliability (30d)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -164,7 +164,7 @@ data:
        {
          "id": 4,
          "type": "stat",
-          "title": "Failures (24h)",
+          "title": "Failed Runs (24h)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -441,7 +441,7 @@ data:
        {
          "id": 8,
          "type": "bargauge",
-          "title": "Failures by Suite (24h)",
+          "title": "Current Gate Health by Suite",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -454,81 +454,7 @@ data:
          },
          "targets": [
            {
-              "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status!~\"ok|passed|success\"}[24h])))",
+              "expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))",
              "refId": "A",
              "legendFormat": "{{suite}}",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "none",
              "min": 0,
              "max": null,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "green",
                    "value": null
                  },
                  {
                    "color": "yellow",
                    "value": 1
                  },
                  {
                    "color": "orange",
                    "value": 3
                  },
                  {
                    "color": "red",
                    "value": 5
                  }
                ]
              }
            },
            "overrides": []
          },
          "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": {
              "calcs": [
                "lastNotNull"
              ],
              "fields": "",
              "values": false
            }
          },
          "transformations": [
            {
              "id": "sortBy",
              "options": {
                "fields": [
                  "Value"
                ],
                "order": "desc"
              }
            }
          ]
        },
        {
          "id": 9,
          "type": "bargauge",
          "title": "Success Rate by Suite (24h)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 8,
            "x": 8,
            "y": 5
          },
          "targets": [
            {
              "expr": "sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1))",
              "refId": "A",
              "legendFormat": "{{suite}}",
              "instant": true
@ -589,8 +515,99 @@ data:
                "order": "asc"
              }
            }
          ],
          "description": "Current pass percentage across the required gate dimensions reported by each suite. This is the fastest place to answer whether the latest suite quality signal is healthy."
        },
        {
          "id": 9,
          "type": "bargauge",
          "title": "Run Reliability by Suite (24h)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
          },
          "gridPos": {
            "h": 8,
            "w": 8,
            "x": 8,
            "y": 5
          },
          "targets": [
            {
              "expr": "sort_desc(((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))) > 0)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h])))) - 1))",
              "refId": "A",
              "legendFormat": "{{suite}}",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent",
              "min": 0,
              "max": 100,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {
                    "color": "red",
                    "value": null
                  },
                  {
                    "color": "orange",
                    "value": 90
                  },
                  {
                    "color": "yellow",
                    "value": 93
                  },
                  {
                    "color": "green",
                    "value": 95
                  },
                  {
                    "color": "blue",
                    "value": 100
                  }
                ]
              },
              "decimals": 2,
              "mappings": [
                {
                  "type": "value",
                  "options": {
                    "-1": {
                      "text": "no runs"
                    }
                  }
                }
              ]
            },
            "overrides": []
          },
          "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": {
              "calcs": [
                "lastNotNull"
              ],
              "fields": "",
              "values": false
            }
          },
          "transformations": [
            {
              "id": "sortBy",
              "options": {
                "fields": [
                  "Value"
                ],
                "order": "asc"
              }
            }
          ],
          "description": "Rolling CI run success rate. This can stay low after failed/debug runs even when Current Gate Health is green."
        },
        {
          "id": 10,
          "type": "bargauge",
@ -670,7 +687,7 @@ data:
        {
          "id": 11,
          "type": "timeseries",
-          "title": "Success History by Suite",
+          "title": "Run Reliability History by Suite",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -3389,7 +3406,7 @@ data:
            "name": "suite",
            "label": "Suite",
            "type": "custom",
-            "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus|pegasus-health|pegasus_health,soteria : soteria,titan_iac : titan_iac|titan-iac,bstein_home : bstein_home|bstein-home,data_prepper : data_prepper|data-prepper",
+            "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper",
            "current": {
              "text": "All",
              "value": "$__all",
@ -3418,7 +3435,7 @@ data:
              },
              {
                "text": "pegasus",
-                "value": "pegasus|pegasus-health|pegasus_health",
+                "value": "pegasus",
                "selected": false
              },
              {
@ -3428,24 +3445,24 @@ data:
              },
              {
                "text": "titan_iac",
-                "value": "titan_iac|titan-iac",
+                "value": "titan_iac",
                "selected": false
              },
              {
                "text": "bstein_home",
-                "value": "bstein_home|bstein-home",
+                "value": "bstein_home",
                "selected": false
              },
              {
                "text": "data_prepper",
-                "value": "data_prepper|data-prepper",
+                "value": "data_prepper",
                "selected": false
              }
            ],
            "hide": 0,
            "multi": false,
            "includeAll": true,
-            "allValue": "ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper",
+            "allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper",
            "refresh": 1,
            "sort": 1,
            "skipUrlSync": false