From 0de90d622a0d62c12ed086e9a41f06c6f06da15a Mon Sep 17 00:00:00 2001 From: jenkins Date: Mon, 18 May 2026 14:18:01 -0300 Subject: [PATCH] monitoring(testing): clarify CI run health labels --- scripts/dashboards_render_atlas.py | 44 +++++++++---------- .../monitoring/dashboards/atlas-overview.json | 4 +- .../monitoring/dashboards/atlas-testing.json | 28 ++++++------ .../grafana-dashboard-overview.yaml | 4 +- .../monitoring/grafana-dashboard-testing.yaml | 28 ++++++------ 5 files changed, 54 insertions(+), 54 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 39e3d8f0..9282847b 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -1628,7 +1628,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = { "Enclosure Climate History": "Temperature, humidity, and VPD over time; smooth movement is healthy, sharp swings need attention.", "Fan Intensity History": "Fan levels from Off to 10; warmer colors mean stronger cooling response and more thermal pressure.", "Flux Source": "Git branch Flux is applying; this should normally be the intended production branch.", - "Run Reliability (24h)": "Percent of published quality-gate runs that passed in 24h; higher means fresher healthy test signal.", + "CI Run Success (24h)": "Percent of published quality-gate CI runs that completed successfully in 24h; this is run health, not individual test pass rate.", "Failed Runs (24h)": "Published quality-gate runs that failed in 24h; zero is good, any value needs a look.", "Suites With Runs (24h)": "Configured suites with at least one published quality-gate run in 24h; full count means the dashboard is fresh.", "Avg Coverage": "Average latest line coverage across suites; higher means code is better protected by tests.", @@ -1664,18 +1664,18 @@ OVERVIEW_PANEL_DESCRIPTIONS = { TESTING_PANEL_DESCRIPTIONS = { - "Run Reliability (24h)": "Percent of selected quality-gate runs that passed in 24h; higher means fresher healthy test signal.", - "Run Reliability (30d)": "Percent of selected quality-gate runs that passed in 30d; higher means more stable test automation.", + "CI Run Success Rate (24h)": "Percent of selected quality-gate CI runs that completed successfully in 24h; this is run health, not individual test pass rate.", + "CI Run Success Rate (30d)": "Percent of selected quality-gate CI runs that completed successfully in 30d; higher means more stable automation.", "Failed Runs (24h)": "Selected quality-gate runs that failed in 24h; zero is good and anything else needs a look.", - "Runs (24h)": "Selected quality-gate run count in 24h; zero means the dashboard may be stale.", + "CI Runs (24h)": "Selected quality-gate CI run count in 24h; zero means the dashboard may be stale.", "Avg Coverage (%)": "Average latest line coverage for selected suites; higher means better test protection.", "Suites with LOC >500": "Selected suites with oversized source files; zero is good for maintainability.", - "Current Gate Health by Suite": "Latest gate pass percent per suite from the daily freshness window; 100% means required checks recently passed.", - "Run Reliability by Suite (24h)": "24h quality-gate pass rate by suite; lower rows are worse and can lag after failed/debug runs.", + "Latest Gate Checks Passing by Suite": "Latest required gate checks passing by suite in the daily freshness window; this includes tests, coverage, LOC, style, and related gates.", + "CI Run Success by Suite (24h)": "24h CI run success rate by suite; lower rows mean recent jobs failed, aborted, or could not complete cleanly.", "Coverage by Suite (Latest, gate 95)": "Latest suite coverage; 95%+ is acceptable and 100% is strongest.", "Files <=500 LOC by Suite (Latest)": "Percent of source files within the 500-line limit; higher is easier to maintain.", - "Reliability And Run History": "Recent run, coverage, LOC, and category trends for selected suites.", - "Run Reliability by Suite (7d rolling)": "Seven-day rolling quality-gate pass rate by suite; blue lanes mean stable tests.", + "CI Runs And Test History": "Recent CI run, coverage, LOC, and test-category trends for selected suites.", + "CI Run Success by Suite (7d rolling)": "Seven-day rolling CI run success rate by suite; blue lanes mean recent runs are completing cleanly.", "Test Category Pass Rate History": "Pass rate by test category; use the Suite filter to focus on one project.", "Daily Run Volume (Selected Scope)": "Rolling daily counts of published quality-gate runs; volume explains confidence.", "Coverage History by Suite": "Coverage over time by suite; rising lines mean better test protection.", @@ -2336,7 +2336,7 @@ def build_overview(): flux_source["options"]["text"] = {"titleSize": 10, "valueSize": 14} panels.append(flux_source) for panel_id, title, expr, y_pos, unit, decimals, thresholds, links in [ - (151, "Run Reliability (24h)", TEST_SUCCESS_RATE_24H, 9, "percent", 1, test_success_thresholds, "atlas-testing"), + (151, "CI Run Success (24h)", TEST_SUCCESS_RATE_24H, 9, "percent", 1, test_success_thresholds, "atlas-testing"), (152, "Failed Runs (24h)", TEST_FAILURES_24H_TOTAL, 11, "none", 0, failure_count_thresholds, "atlas-testing"), (153, "Suites With Runs (24h)", PLATFORM_TEST_ACTIVE_SUITES_24H, 13, "none", 0, perfect_count_thresholds, "atlas-testing"), (154, "Avg Coverage", overview_avg_coverage, 15, "percent", 1, test_success_thresholds, "atlas-testing"), @@ -4079,7 +4079,7 @@ def build_jobs_dashboard(): panels.append( stat_panel( 2, - "Run Reliability (24h)", + "CI Run Success Rate (24h)", success_rate_24h, {"h": 5, "w": 4, "x": 0, "y": 0}, unit="percent", @@ -4091,7 +4091,7 @@ def build_jobs_dashboard(): panels.append( stat_panel( 3, - "Run Reliability (30d)", + "CI Run Success Rate (30d)", success_rate_30d, {"h": 5, "w": 4, "x": 4, "y": 0}, unit="percent", @@ -4114,7 +4114,7 @@ def build_jobs_dashboard(): panels.append( stat_panel( 5, - "Runs (24h)", + "CI Runs (24h)", runs_24h, {"h": 5, "w": 4, "x": 12, "y": 0}, unit="none", @@ -4152,7 +4152,7 @@ def build_jobs_dashboard(): panels.append( bargauge_panel( 8, - "Current Gate Health by Suite", + "Latest Gate Checks Passing by Suite", current_gate_health_by_suite, {"h": 8, "w": 8, "x": 0, "y": 5}, unit="percent", @@ -4167,12 +4167,12 @@ def build_jobs_dashboard(): {"type": "value", "options": {"-1": {"text": "missing"}}} ] panels[-1]["description"] = ( - "Latest pass percentage across required gate dimensions in the daily freshness window. " - "100% is clean; missing means the suite has not published recent gate data." + "Latest pass percentage across required gate checks in the daily freshness window. " + "100% means tests and supporting gates recently passed; missing means no fresh gate data." ) reliability_suite_panel = bargauge_panel( 9, - "Run Reliability by Suite (24h)", + "CI Run Success by Suite (24h)", success_rate_by_suite_24h, {"h": 8, "w": 8, "x": 8, "y": 5}, unit="percent", @@ -4183,8 +4183,8 @@ def build_jobs_dashboard(): decimals=2, ) reliability_suite_panel["description"] = ( - "Rolling quality-gate pass rate. This can stay low after failed/debug runs even when " - "Current Gate Health is green." + "24h CI run success rate. This can stay low after failed, aborted, or debug runs even " + "when the latest gate checks are green." ) reliability_suite_panel["fieldConfig"]["defaults"]["mappings"] = [ {"type": "value", "options": {"-1": {"text": "no runs"}}} @@ -4192,13 +4192,13 @@ def build_jobs_dashboard(): panels.append(reliability_suite_panel) history_panel = state_timeline_panel( 11, - "Run Reliability by Suite (7d rolling)", + "CI Run Success by Suite (7d rolling)", success_history_by_suite, {"h": 8, "w": 24, "x": 0, "y": 13}, thresholds=success_thresholds, description=( - "Seven-day rolling quality-gate pass rate per suite. Each suite gets its own lane, " - "so brief failed/debug runs lower the lane color without creating unreadable 0/100 spikes." + "Seven-day rolling CI run success rate per suite. Each suite gets its own lane, " + "so failed or aborted runs lower the lane color without creating unreadable 0/100 spikes." ), ) panels.append(history_panel) @@ -4664,7 +4664,7 @@ def build_jobs_dashboard(): compact_panels.extend( [ - row_panel(500, "Reliability And Run History", 11, panels=children([11, 153, 12, 13, 14])), + row_panel(500, "CI Runs And Test History", 11, panels=children([11, 153, 12, 13, 14])), row_panel( 501, "Check Failure Rates By Suite", diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index afeda905..cdbdd90f 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -2114,7 +2114,7 @@ { "id": 151, "type": "stat", - "title": "Run Reliability (24h)", + "title": "CI Run Success (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2195,7 +2195,7 @@ "targetBlank": true } ], - "description": "Percent of published quality-gate runs that passed in 24h; higher means fresher healthy test signal." + "description": "Percent of published quality-gate CI runs that completed successfully in 24h; this is run health, not individual test pass rate." }, { "id": 152, diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json index 9d060f97..aa77f74e 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-testing.json @@ -7,7 +7,7 @@ { "id": 2, "type": "stat", - "title": "Run Reliability (24h)", + "title": "CI Run Success Rate (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -77,12 +77,12 @@ }, "textMode": "value" }, - "description": "Percent of selected quality-gate runs that passed in 24h; higher means fresher healthy test signal." + "description": "Percent of selected quality-gate CI runs that completed successfully in 24h; this is run health, not individual test pass rate." }, { "id": 3, "type": "stat", - "title": "Run Reliability (30d)", + "title": "CI Run Success Rate (30d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -152,7 +152,7 @@ }, "textMode": "value" }, - "description": "Percent of selected quality-gate runs that passed in 30d; higher means more stable test automation." + "description": "Percent of selected quality-gate CI runs that completed successfully in 30d; higher means more stable automation." }, { "id": 4, @@ -231,7 +231,7 @@ { "id": 5, "type": "stat", - "title": "Runs (24h)", + "title": "CI Runs (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -288,7 +288,7 @@ }, "textMode": "value" }, - "description": "Selected quality-gate run count in 24h; zero means the dashboard may be stale." + "description": "Selected quality-gate CI run count in 24h; zero means the dashboard may be stale." }, { "id": 6, @@ -442,7 +442,7 @@ { "id": 8, "type": "bargauge", - "title": "Current Gate Health by Suite", + "title": "Latest Gate Checks Passing by Suite", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -530,12 +530,12 @@ } } ], - "description": "Latest pass percentage across required gate dimensions in the daily freshness window. 100% is clean; missing means the suite has not published recent gate data." + "description": "Latest pass percentage across required gate checks in the daily freshness window. 100% means tests and supporting gates recently passed; missing means no fresh gate data." }, { "id": 9, "type": "bargauge", - "title": "Run Reliability by Suite (24h)", + "title": "CI Run Success by Suite (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -623,7 +623,7 @@ } } ], - "description": "Rolling quality-gate pass rate. This can stay low after failed/debug runs even when Current Gate Health is green." + "description": "24h CI run success rate. This can stay low after failed, aborted, or debug runs even when the latest gate checks are green." }, { "id": 17, @@ -814,7 +814,7 @@ { "id": 500, "type": "row", - "title": "Reliability And Run History", + "title": "CI Runs And Test History", "gridPos": { "h": 1, "w": 24, @@ -826,8 +826,8 @@ { "id": 11, "type": "state-timeline", - "title": "Run Reliability by Suite (7d rolling)", - "description": "Seven-day rolling quality-gate pass rate per suite. Each suite gets its own lane, so brief failed/debug runs lower the lane color without creating unreadable 0/100 spikes.", + "title": "CI Run Success by Suite (7d rolling)", + "description": "Seven-day rolling CI run success rate per suite. Each suite gets its own lane, so failed or aborted runs lower the lane color without creating unreadable 0/100 spikes.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1299,7 +1299,7 @@ } } ], - "description": "Recent run, coverage, LOC, and category trends for selected suites." + "description": "Recent CI run, coverage, LOC, and test-category trends for selected suites." }, { "id": 501, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 4e32d3c0..df603bbc 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -2123,7 +2123,7 @@ data: { "id": 151, "type": "stat", - "title": "Run Reliability (24h)", + "title": "CI Run Success (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2204,7 +2204,7 @@ data: "targetBlank": true } ], - "description": "Percent of published quality-gate runs that passed in 24h; higher means fresher healthy test signal." + "description": "Percent of published quality-gate CI runs that completed successfully in 24h; this is run health, not individual test pass rate." }, { "id": 152, diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml index 8b8186e9..27a60302 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -16,7 +16,7 @@ data: { "id": 2, "type": "stat", - "title": "Run Reliability (24h)", + "title": "CI Run Success Rate (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -86,12 +86,12 @@ data: }, "textMode": "value" }, - "description": "Percent of selected quality-gate runs that passed in 24h; higher means fresher healthy test signal." + "description": "Percent of selected quality-gate CI runs that completed successfully in 24h; this is run health, not individual test pass rate." }, { "id": 3, "type": "stat", - "title": "Run Reliability (30d)", + "title": "CI Run Success Rate (30d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -161,7 +161,7 @@ data: }, "textMode": "value" }, - "description": "Percent of selected quality-gate runs that passed in 30d; higher means more stable test automation." + "description": "Percent of selected quality-gate CI runs that completed successfully in 30d; higher means more stable automation." }, { "id": 4, @@ -240,7 +240,7 @@ data: { "id": 5, "type": "stat", - "title": "Runs (24h)", + "title": "CI Runs (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -297,7 +297,7 @@ data: }, "textMode": "value" }, - "description": "Selected quality-gate run count in 24h; zero means the dashboard may be stale." + "description": "Selected quality-gate CI run count in 24h; zero means the dashboard may be stale." }, { "id": 6, @@ -451,7 +451,7 @@ data: { "id": 8, "type": "bargauge", - "title": "Current Gate Health by Suite", + "title": "Latest Gate Checks Passing by Suite", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -539,12 +539,12 @@ data: } } ], - "description": "Latest pass percentage across required gate dimensions in the daily freshness window. 100% is clean; missing means the suite has not published recent gate data." + "description": "Latest pass percentage across required gate checks in the daily freshness window. 100% means tests and supporting gates recently passed; missing means no fresh gate data." }, { "id": 9, "type": "bargauge", - "title": "Run Reliability by Suite (24h)", + "title": "CI Run Success by Suite (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -632,7 +632,7 @@ data: } } ], - "description": "Rolling quality-gate pass rate. This can stay low after failed/debug runs even when Current Gate Health is green." + "description": "24h CI run success rate. This can stay low after failed, aborted, or debug runs even when the latest gate checks are green." }, { "id": 17, @@ -823,7 +823,7 @@ data: { "id": 500, "type": "row", - "title": "Reliability And Run History", + "title": "CI Runs And Test History", "gridPos": { "h": 1, "w": 24, @@ -835,8 +835,8 @@ data: { "id": 11, "type": "state-timeline", - "title": "Run Reliability by Suite (7d rolling)", - "description": "Seven-day rolling quality-gate pass rate per suite. Each suite gets its own lane, so brief failed/debug runs lower the lane color without creating unreadable 0/100 spikes.", + "title": "CI Run Success by Suite (7d rolling)", + "description": "Seven-day rolling CI run success rate per suite. Each suite gets its own lane, so failed or aborted runs lower the lane color without creating unreadable 0/100 spikes.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1308,7 +1308,7 @@ data: } } ], - "description": "Recent run, coverage, LOC, and category trends for selected suites." + "description": "Recent CI run, coverage, LOC, and test-category trends for selected suites." }, { "id": 501,