monitoring: clarify quality gate dashboard tooltips

2026-05-16 17:03:58 -03:00 · 2026-05-16 17:03:58 -03:00 · 588cc3aa14
commit 588cc3aa14
parent ad86195436
6 changed files with 38 additions and 38 deletions
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -1618,9 +1618,9 @@ OVERVIEW_PANEL_DESCRIPTIONS = {
    "Enclosure Climate History": "Temperature, humidity, and VPD over time; smooth movement is healthy, sharp swings need attention.",
    "Fan Intensity History": "Fan levels from Off to 10; warmer colors mean stronger cooling response and more thermal pressure.",
    "Flux Source": "Git branch Flux is applying; this should normally be the intended production branch.",
-    "Run Reliability (24h)": "Percent of CI runs that completed successfully in 24h; higher is better for release confidence.",
-    "Failed Runs (24h)": "CI runs that failed in 24h; zero is good, any value means recent test signal is noisy.",
-    "Suites Reporting (24h)": "Suites with at least one CI run in 24h; full count means the dashboard signal is fresh.",
+    "Run Reliability (24h)": "Percent of published quality-gate runs that passed in 24h; higher means fresher healthy test signal.",
+    "Failed Runs (24h)": "Published quality-gate runs that failed in 24h; zero is good, any value needs a look.",
+    "Suites Reporting (24h)": "Suites with at least one published quality-gate run in 24h; higher means fresher dashboard signal.",
    "Avg Coverage": "Average latest line coverage across suites; higher means code is better protected by tests.",
    "LOC Clean Suites": "Suites with no source files over 500 LOC; full count is good for maintainability.",
    "GitOps Health": "Flux readiness and suspension health over time; blue is perfect, warmer colors mean drift or pause.",
@@ -1654,20 +1654,20 @@ OVERVIEW_PANEL_DESCRIPTIONS = {


 TESTING_PANEL_DESCRIPTIONS = {
-    "Run Reliability (24h)": "Percent of selected CI runs that finished successfully in 24h; higher is better.",
-    "Run Reliability (30d)": "Percent of selected CI runs that finished successfully in 30d; higher shows stable automation.",
-    "Failed Runs (24h)": "Selected CI runs that failed in 24h; zero is good and anything else needs a look.",
-    "Runs (24h)": "Selected CI run count in 24h; zero means the dashboard may be stale.",
+    "Run Reliability (24h)": "Percent of selected quality-gate runs that passed in 24h; higher means fresher healthy test signal.",
+    "Run Reliability (30d)": "Percent of selected quality-gate runs that passed in 30d; higher means more stable test automation.",
+    "Failed Runs (24h)": "Selected quality-gate runs that failed in 24h; zero is good and anything else needs a look.",
+    "Runs (24h)": "Selected quality-gate run count in 24h; zero means the dashboard may be stale.",
    "Avg Coverage (%)": "Average latest line coverage for selected suites; higher means better test protection.",
    "Suites with LOC >500": "Selected suites with oversized source files; zero is good for maintainability.",
    "Current Gate Health by Suite": "Latest gate pass percent per suite; 100% means all required checks currently pass.",
-    "Run Reliability by Suite (24h)": "24h run success by suite; lower rows are worse and can lag after failed/debug runs.",
+    "Run Reliability by Suite (24h)": "24h quality-gate pass rate by suite; lower rows are worse and can lag after failed/debug runs.",
    "Coverage by Suite (Latest, gate 95)": "Latest suite coverage; 95%+ is acceptable and 100% is strongest.",
    "Files <=500 LOC by Suite (Latest)": "Percent of source files within the 500-line limit; higher is easier to maintain.",
    "Reliability And Run History": "Recent run, coverage, LOC, and category trends for selected suites.",
-    "Run Reliability by Suite (7d rolling)": "Seven-day rolling run success by suite; blue lanes mean stable CI.",
+    "Run Reliability by Suite (7d rolling)": "Seven-day rolling quality-gate pass rate by suite; blue lanes mean stable tests.",
    "Test Category Pass Rate History": "Pass rate by test category; use the Suite filter to focus on one project.",
-    "Daily Run Volume (Selected Scope)": "Rolling daily counts of successful and failed runs; volume explains confidence.",
+    "Daily Run Volume (Selected Scope)": "Rolling daily counts of published quality-gate runs; volume explains confidence.",
    "Coverage History by Suite": "Coverage over time by suite; rising lines mean better test protection.",
    "Files <=500 LOC History by Suite": "LOC compliance over time; blue lanes mean files stay within the size limit.",
    "Check Failure Rates By Suite": "Failure percent by check family; blue is zero failures, warmer colors show blockers.",
@@ -4122,7 +4122,7 @@ def build_jobs_dashboard():
        decimals=2,
    )
    reliability_suite_panel["description"] = (
-        "Rolling CI run success rate. This can stay low after failed/debug runs even when "
+        "Rolling quality-gate pass rate. This can stay low after failed/debug runs even when "
        "Current Gate Health is green."
    )
    reliability_suite_panel["fieldConfig"]["defaults"]["mappings"] = [
@@ -4136,7 +4136,7 @@ def build_jobs_dashboard():
        {"h": 8, "w": 24, "x": 0, "y": 13},
        thresholds=success_thresholds,
        description=(
-            "Seven-day rolling run success rate per suite. Each suite gets its own lane, "
+            "Seven-day rolling quality-gate pass rate per suite. Each suite gets its own lane, "
            "so brief failed/debug runs lower the lane color without creating unreadable 0/100 spikes."
        ),
    )
@@ -4157,7 +4157,7 @@ def build_jobs_dashboard():
        legend_calcs=[],
    )
    run_volume_panel["description"] = (
-        "Twenty-four-hour rolling run counts for the selected suite/branch scope. "
+        "Twenty-four-hour rolling quality-gate run counts for the selected suite/branch scope. "
        "This is volume, not a pass-rate percentage."
    )
    run_volume_panel["fieldConfig"]["defaults"]["min"] = 0
@@ -4251,8 +4251,8 @@ def build_jobs_dashboard():
            max_value=None,
            legend="{{suite}} - {{test}}",
            description=(
-                "Top tests inside each hourly bucket. A test can show only 1-2 here while the 30d panel "
-                "shows a larger total from earlier hours."
+                "Top failing tests inside each hourly bucket. Short current bars can still belong to tests "
+                "with larger long-window totals."
            ),
        )
    )
--- a/scripts/tests/test_dashboards_render_atlas.py
+++ b/scripts/tests/test_dashboards_render_atlas.py
@@ -122,7 +122,7 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
    assert panels_by_title["Suites Reporting (24h)"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 13}
    suites_reporting_expr = panels_by_title["Suites Reporting (24h)"]["targets"][0]["expr"]
    assert "> bool 0" in suites_reporting_expr
-    assert "Suites with at least one CI run" in panels_by_title["Suites Reporting (24h)"]["description"]
+    assert "published quality-gate run" in panels_by_title["Suites Reporting (24h)"]["description"]
    assert panels_by_title["LOC Clean Suites"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 17}
    assert panels_by_title["GitOps Health"]["type"] == "state-timeline"
    assert panels_by_title["GitOps Health"]["gridPos"] == {"h": 6, "w": 6, "x": 15, "y": 7}
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -2195,7 +2195,7 @@
          "targetBlank": true
        }
      ],
-      "description": "Percent of CI runs that completed successfully in 24h; higher is better for release confidence."
+      "description": "Percent of published quality-gate runs that passed in 24h; higher means fresher healthy test signal."
    },
    {
      "id": 152,
@@ -2277,7 +2277,7 @@
          "targetBlank": true
        }
      ],
-      "description": "CI runs that failed in 24h; zero is good, any value means recent test signal is noisy."
+      "description": "Published quality-gate runs that failed in 24h; zero is good, any value needs a look."
    },
    {
      "id": 153,
@@ -2359,7 +2359,7 @@
          "targetBlank": true
        }
      ],
-      "description": "Suites with at least one CI run in 24h; full count means the dashboard signal is fresh."
+      "description": "Suites with at least one published quality-gate run in 24h; higher means fresher dashboard signal."
    },
    {
      "id": 154,
--- a/services/monitoring/dashboards/atlas-testing.json
+++ b/services/monitoring/dashboards/atlas-testing.json
@@ -77,7 +77,7 @@
        },
        "textMode": "value"
      },
-      "description": "Percent of selected CI runs that finished successfully in 24h; higher is better."
+      "description": "Percent of selected quality-gate runs that passed in 24h; higher means fresher healthy test signal."
    },
    {
      "id": 3,
@@ -152,7 +152,7 @@
        },
        "textMode": "value"
      },
-      "description": "Percent of selected CI runs that finished successfully in 30d; higher shows stable automation."
+      "description": "Percent of selected quality-gate runs that passed in 30d; higher means more stable test automation."
    },
    {
      "id": 4,
@@ -226,7 +226,7 @@
        },
        "textMode": "value"
      },
-      "description": "Selected CI runs that failed in 24h; zero is good and anything else needs a look."
+      "description": "Selected quality-gate runs that failed in 24h; zero is good and anything else needs a look."
    },
    {
      "id": 5,
@@ -288,7 +288,7 @@
        },
        "textMode": "value"
      },
-      "description": "Selected CI run count in 24h; zero means the dashboard may be stale."
+      "description": "Selected quality-gate run count in 24h; zero means the dashboard may be stale."
    },
    {
      "id": 6,
@@ -623,7 +623,7 @@
          }
        }
      ],
-      "description": "Rolling CI run success rate. This can stay low after failed/debug runs even when Current Gate Health is green."
+      "description": "Rolling quality-gate pass rate. This can stay low after failed/debug runs even when Current Gate Health is green."
    },
    {
      "id": 17,
@@ -827,7 +827,7 @@
          "id": 11,
          "type": "state-timeline",
          "title": "Run Reliability by Suite (7d rolling)",
-          "description": "Seven-day rolling run success rate per suite. Each suite gets its own lane, so brief failed/debug runs lower the lane color without creating unreadable 0/100 spikes.",
+          "description": "Seven-day rolling quality-gate pass rate per suite. Each suite gets its own lane, so brief failed/debug runs lower the lane color without creating unreadable 0/100 spikes.",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@@ -1144,7 +1144,7 @@
              "mode": "multi"
            }
          },
-          "description": "Twenty-four-hour rolling run counts for the selected suite/branch scope. This is volume, not a pass-rate percentage."
+          "description": "Twenty-four-hour rolling quality-gate run counts for the selected suite/branch scope. This is volume, not a pass-rate percentage."
        },
        {
          "id": 13,
@@ -2411,7 +2411,7 @@
          "id": 145,
          "type": "state-timeline",
          "title": "Problematic Tests Over Time (Top failures)",
-          "description": "Top tests inside each hourly bucket. A test can show only 1-2 here while the 30d panel shows a larger total from earlier hours.",
+          "description": "Top failing tests inside each hourly bucket. Short current bars can still belong to tests with larger long-window totals.",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -2204,7 +2204,7 @@ data:
              "targetBlank": true
            }
          ],
-          "description": "Percent of CI runs that completed successfully in 24h; higher is better for release confidence."
+          "description": "Percent of published quality-gate runs that passed in 24h; higher means fresher healthy test signal."
        },
        {
          "id": 152,
@@ -2286,7 +2286,7 @@ data:
              "targetBlank": true
            }
          ],
-          "description": "CI runs that failed in 24h; zero is good, any value means recent test signal is noisy."
+          "description": "Published quality-gate runs that failed in 24h; zero is good, any value needs a look."
        },
        {
          "id": 153,
@@ -2368,7 +2368,7 @@ data:
              "targetBlank": true
            }
          ],
-          "description": "Suites with at least one CI run in 24h; full count means the dashboard signal is fresh."
+          "description": "Suites with at least one published quality-gate run in 24h; higher means fresher dashboard signal."
        },
        {
          "id": 154,
--- a/services/monitoring/grafana-dashboard-testing.yaml
+++ b/services/monitoring/grafana-dashboard-testing.yaml
@@ -86,7 +86,7 @@ data:
            },
            "textMode": "value"
          },
-          "description": "Percent of selected CI runs that finished successfully in 24h; higher is better."
+          "description": "Percent of selected quality-gate runs that passed in 24h; higher means fresher healthy test signal."
        },
        {
          "id": 3,
@@ -161,7 +161,7 @@ data:
            },
            "textMode": "value"
          },
-          "description": "Percent of selected CI runs that finished successfully in 30d; higher shows stable automation."
+          "description": "Percent of selected quality-gate runs that passed in 30d; higher means more stable test automation."
        },
        {
          "id": 4,
@@ -235,7 +235,7 @@ data:
            },
            "textMode": "value"
          },
-          "description": "Selected CI runs that failed in 24h; zero is good and anything else needs a look."
+          "description": "Selected quality-gate runs that failed in 24h; zero is good and anything else needs a look."
        },
        {
          "id": 5,
@@ -297,7 +297,7 @@ data:
            },
            "textMode": "value"
          },
-          "description": "Selected CI run count in 24h; zero means the dashboard may be stale."
+          "description": "Selected quality-gate run count in 24h; zero means the dashboard may be stale."
        },
        {
          "id": 6,
@@ -632,7 +632,7 @@ data:
              }
            }
          ],
-          "description": "Rolling CI run success rate. This can stay low after failed/debug runs even when Current Gate Health is green."
+          "description": "Rolling quality-gate pass rate. This can stay low after failed/debug runs even when Current Gate Health is green."
        },
        {
          "id": 17,
@@ -836,7 +836,7 @@ data:
              "id": 11,
              "type": "state-timeline",
              "title": "Run Reliability by Suite (7d rolling)",
-              "description": "Seven-day rolling run success rate per suite. Each suite gets its own lane, so brief failed/debug runs lower the lane color without creating unreadable 0/100 spikes.",
+              "description": "Seven-day rolling quality-gate pass rate per suite. Each suite gets its own lane, so brief failed/debug runs lower the lane color without creating unreadable 0/100 spikes.",
              "datasource": {
                "type": "prometheus",
                "uid": "atlas-vm"
@@ -1153,7 +1153,7 @@ data:
                  "mode": "multi"
                }
              },
-              "description": "Twenty-four-hour rolling run counts for the selected suite/branch scope. This is volume, not a pass-rate percentage."
+              "description": "Twenty-four-hour rolling quality-gate run counts for the selected suite/branch scope. This is volume, not a pass-rate percentage."
            },
            {
              "id": 13,
@@ -2420,7 +2420,7 @@ data:
              "id": 145,
              "type": "state-timeline",
              "title": "Problematic Tests Over Time (Top failures)",
-              "description": "Top tests inside each hourly bucket. A test can show only 1-2 here while the 30d panel shows a larger total from earlier hours.",
+              "description": "Top failing tests inside each hourly bucket. Short current bars can still belong to tests with larger long-window totals.",
              "datasource": {
                "type": "prometheus",
                "uid": "atlas-vm"