From 23146aaa8a6c5c221c92f530c35edc67294a8294 Mon Sep 17 00:00:00 2001 From: jenkins Date: Wed, 22 Apr 2026 14:34:40 -0300 Subject: [PATCH] monitoring(testing): clean canonical suite rows --- scripts/dashboards_render_atlas.py | 73 ++++--- .../monitoring/dashboards/atlas-jobs.json | 191 ++++++++++-------- .../monitoring/dashboards/atlas-testing.json | 191 ++++++++++-------- .../monitoring/grafana-dashboard-jobs.yaml | 191 ++++++++++-------- .../monitoring/grafana-dashboard-testing.yaml | 191 ++++++++++-------- 5 files changed, 462 insertions(+), 375 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index a65b48c5..ba8cd64b 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -529,7 +529,7 @@ PLATFORM_TEST_SUITE_MATCHER = "|".join( PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite) for suite in PLATFORM_TEST_SUITE_NAMES ) PLATFORM_TEST_SUITE_CANONICAL_MATCHER = "|".join(PLATFORM_TEST_SUITE_NAMES) -PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER = PLATFORM_TEST_SUITE_MATCHER +PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER = PLATFORM_TEST_SUITE_CANONICAL_MATCHER PLATFORM_TEST_SUCCESS_EVENTS_30D = ( f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[30d])) or on() vector(0))' ) @@ -1098,15 +1098,12 @@ def testing_suite_variable(): options = [ { "text": suite, - "value": PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite), + "value": suite, "selected": False, } for suite in PLATFORM_TEST_SUITE_NAMES ] - query = ",".join( - f"{suite} : {PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite)}" - for suite in PLATFORM_TEST_SUITE_NAMES - ) + query = ",".join(f"{suite} : {suite}" for suite in PLATFORM_TEST_SUITE_NAMES) return { "name": "suite", "label": "Suite", @@ -3111,13 +3108,24 @@ def build_jobs_dashboard(): ) success_rate_24h = f"100 * ({success_24h}) / clamp_min(({runs_24h}), 1)" success_rate_30d = f"100 * ({success_30d}) / clamp_min(({runs_30d}), 1)" + runs_by_suite_24h = f'sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[24h]))' + success_by_suite_24h = ( + f'sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[24h]))' + ) success_rate_by_suite_24h = ( - f'sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[24h]))) ' - f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[24h]))), 1))' + f'sort_desc(((100 * ({success_by_suite_24h}) / clamp_min(({runs_by_suite_24h}), 1)) ' + f'and on(suite) (({runs_by_suite_24h}) > 0)) ' + f'or on(suite) ((0 * ({runs_by_suite_24h})) - 1))' ) failures_by_suite_24h = ( f'sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_failure_selector}}}[24h]))' ) + non_failure = PLATFORM_TEST_NON_FAILURE_STATUS + current_gate_health_by_suite = ( + f'(100 * sum by (suite) (max by (suite, check) (({{{checks_selector},result=~"{non_failure}"}} > bool 0))) ' + f'/ clamp_min(sum by (suite) (max by (suite, check) (({{{checks_selector}}} > bool 0))), 1)) ' + f'or on(suite) ({selected_suite_zero})' + ) success_history_by_suite = ( f'100 * (sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[$__interval])) ' f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[$__interval]))), 1))' @@ -3137,7 +3145,6 @@ def build_jobs_dashboard(): average_coverage = f"(avg(({coverage_by_suite})) or on() vector(0))" suites_loc_violating = f'(sum((({smell_by_suite}) > bool 0)) or on() vector(0))' - non_failure = PLATFORM_TEST_NON_FAILURE_STATUS checks_failed_total = f'(sum({{{checks_selector},result!~"{non_failure}"}}) or on() vector(0))' checks_failed_tests = ( f'(sum(count by (suite) ({{{checks_selector},check=~"tests|unit|build",result!~"{non_failure}"}})) or on() vector(0))' @@ -3284,7 +3291,7 @@ def build_jobs_dashboard(): panels.append( stat_panel( 2, - "Success Rate (24h)", + "Run Reliability (24h)", success_rate_24h, {"h": 5, "w": 4, "x": 0, "y": 0}, unit="percent", @@ -3296,7 +3303,7 @@ def build_jobs_dashboard(): panels.append( stat_panel( 3, - "Success Rate (30d)", + "Run Reliability (30d)", success_rate_30d, {"h": 5, "w": 4, "x": 4, "y": 0}, unit="percent", @@ -3308,7 +3315,7 @@ def build_jobs_dashboard(): panels.append( stat_panel( 4, - "Failures (24h)", + "Failed Runs (24h)", failures_24h, {"h": 5, "w": 4, "x": 8, "y": 0}, unit="none", @@ -3357,21 +3364,9 @@ def build_jobs_dashboard(): panels.append( bargauge_panel( 8, - "Failures by Suite (24h)", - failures_by_suite_24h, + "Current Gate Health by Suite", + current_gate_health_by_suite, {"h": 8, "w": 8, "x": 0, "y": 5}, - unit="none", - instant=True, - legend="{{suite}}", - thresholds=failures_thresholds, - ) - ) - panels.append( - bargauge_panel( - 9, - "Success Rate by Suite (24h)", - success_rate_by_suite_24h, - {"h": 8, "w": 8, "x": 8, "y": 5}, unit="percent", instant=True, legend="{{suite}}", @@ -3380,6 +3375,30 @@ def build_jobs_dashboard(): decimals=2, ) ) + panels[-1]["description"] = ( + "Current pass percentage across the required gate dimensions reported by each suite. " + "This is the fastest place to answer whether the latest suite quality signal is healthy." + ) + reliability_suite_panel = bargauge_panel( + 9, + "Run Reliability by Suite (24h)", + success_rate_by_suite_24h, + {"h": 8, "w": 8, "x": 8, "y": 5}, + unit="percent", + instant=True, + legend="{{suite}}", + sort_order="asc", + thresholds=success_thresholds, + decimals=2, + ) + reliability_suite_panel["description"] = ( + "Rolling CI run success rate. This can stay low after failed/debug runs even when " + "Current Gate Health is green." + ) + reliability_suite_panel["fieldConfig"]["defaults"]["mappings"] = [ + {"type": "value", "options": {"-1": {"text": "no runs"}}} + ] + panels.append(reliability_suite_panel) coverage_gap_panel = bargauge_panel( 10, "Coverage Gap to 95% by Suite", @@ -3397,7 +3416,7 @@ def build_jobs_dashboard(): history_panel = timeseries_panel( 11, - "Success History by Suite", + "Run Reliability History by Suite", success_history_by_suite, {"h": 8, "w": 24, "x": 0, "y": 13}, unit="percent", diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index 1a38da09..aaa00cb3 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -7,7 +7,7 @@ { "id": 2, "type": "stat", - "title": "Success Rate (24h)", + "title": "Run Reliability (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -81,7 +81,7 @@ { "id": 3, "type": "stat", - "title": "Success Rate (30d)", + "title": "Run Reliability (30d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -155,7 +155,7 @@ { "id": 4, "type": "stat", - "title": "Failures (24h)", + "title": "Failed Runs (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -432,7 +432,7 @@ { "id": 8, "type": "bargauge", - "title": "Failures by Suite (24h)", + "title": "Current Gate Health by Suite", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -445,81 +445,7 @@ }, "targets": [ { - "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status!~\"ok|passed|success\"}[24h])))", - "refId": "A", - "legendFormat": "{{suite}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 3 - }, - { - "color": "red", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 9, - "type": "bargauge", - "title": "Success Rate by Suite (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 5 - }, - "targets": [ - { - "expr": "sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1))", + "expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", "refId": "A", "legendFormat": "{{suite}}", "instant": true @@ -580,7 +506,98 @@ "order": "asc" } } - ] + ], + "description": "Current pass percentage across the required gate dimensions reported by each suite. This is the fastest place to answer whether the latest suite quality signal is healthy." + }, + { + "id": 9, + "type": "bargauge", + "title": "Run Reliability by Suite (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 5 + }, + "targets": [ + { + "expr": "sort_desc(((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))) > 0)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h])))) - 1))", + "refId": "A", + "legendFormat": "{{suite}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 90 + }, + { + "color": "yellow", + "value": 93 + }, + { + "color": "green", + "value": 95 + }, + { + "color": "blue", + "value": 100 + } + ] + }, + "decimals": 2, + "mappings": [ + { + "type": "value", + "options": { + "-1": { + "text": "no runs" + } + } + } + ] + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "asc" + } + } + ], + "description": "Rolling CI run success rate. This can stay low after failed/debug runs even when Current Gate Health is green." }, { "id": 10, @@ -661,7 +678,7 @@ { "id": 11, "type": "timeseries", - "title": "Success History by Suite", + "title": "Run Reliability History by Suite", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -3380,7 +3397,7 @@ "name": "suite", "label": "Suite", "type": "custom", - "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus|pegasus-health|pegasus_health,soteria : soteria,titan_iac : titan_iac|titan-iac,bstein_home : bstein_home|bstein-home,data_prepper : data_prepper|data-prepper", + "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper", "current": { "text": "All", "value": "$__all", @@ -3409,7 +3426,7 @@ }, { "text": "pegasus", - "value": "pegasus|pegasus-health|pegasus_health", + "value": "pegasus", "selected": false }, { @@ -3419,24 +3436,24 @@ }, { "text": "titan_iac", - "value": "titan_iac|titan-iac", + "value": "titan_iac", "selected": false }, { "text": "bstein_home", - "value": "bstein_home|bstein-home", + "value": "bstein_home", "selected": false }, { "text": "data_prepper", - "value": "data_prepper|data-prepper", + "value": "data_prepper", "selected": false } ], "hide": 0, "multi": false, "includeAll": true, - "allValue": "ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper", + "allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper", "refresh": 1, "sort": 1, "skipUrlSync": false diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json index 8fb5b8ed..71ccae86 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-testing.json @@ -7,7 +7,7 @@ { "id": 2, "type": "stat", - "title": "Success Rate (24h)", + "title": "Run Reliability (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -81,7 +81,7 @@ { "id": 3, "type": "stat", - "title": "Success Rate (30d)", + "title": "Run Reliability (30d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -155,7 +155,7 @@ { "id": 4, "type": "stat", - "title": "Failures (24h)", + "title": "Failed Runs (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -432,7 +432,7 @@ { "id": 8, "type": "bargauge", - "title": "Failures by Suite (24h)", + "title": "Current Gate Health by Suite", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -445,81 +445,7 @@ }, "targets": [ { - "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status!~\"ok|passed|success\"}[24h])))", - "refId": "A", - "legendFormat": "{{suite}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 3 - }, - { - "color": "red", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 9, - "type": "bargauge", - "title": "Success Rate by Suite (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 5 - }, - "targets": [ - { - "expr": "sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1))", + "expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", "refId": "A", "legendFormat": "{{suite}}", "instant": true @@ -580,7 +506,98 @@ "order": "asc" } } - ] + ], + "description": "Current pass percentage across the required gate dimensions reported by each suite. This is the fastest place to answer whether the latest suite quality signal is healthy." + }, + { + "id": 9, + "type": "bargauge", + "title": "Run Reliability by Suite (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 5 + }, + "targets": [ + { + "expr": "sort_desc(((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))) > 0)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h])))) - 1))", + "refId": "A", + "legendFormat": "{{suite}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 90 + }, + { + "color": "yellow", + "value": 93 + }, + { + "color": "green", + "value": 95 + }, + { + "color": "blue", + "value": 100 + } + ] + }, + "decimals": 2, + "mappings": [ + { + "type": "value", + "options": { + "-1": { + "text": "no runs" + } + } + } + ] + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "asc" + } + } + ], + "description": "Rolling CI run success rate. This can stay low after failed/debug runs even when Current Gate Health is green." }, { "id": 10, @@ -661,7 +678,7 @@ { "id": 11, "type": "timeseries", - "title": "Success History by Suite", + "title": "Run Reliability History by Suite", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -3380,7 +3397,7 @@ "name": "suite", "label": "Suite", "type": "custom", - "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus|pegasus-health|pegasus_health,soteria : soteria,titan_iac : titan_iac|titan-iac,bstein_home : bstein_home|bstein-home,data_prepper : data_prepper|data-prepper", + "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper", "current": { "text": "All", "value": "$__all", @@ -3409,7 +3426,7 @@ }, { "text": "pegasus", - "value": "pegasus|pegasus-health|pegasus_health", + "value": "pegasus", "selected": false }, { @@ -3419,24 +3436,24 @@ }, { "text": "titan_iac", - "value": "titan_iac|titan-iac", + "value": "titan_iac", "selected": false }, { "text": "bstein_home", - "value": "bstein_home|bstein-home", + "value": "bstein_home", "selected": false }, { "text": "data_prepper", - "value": "data_prepper|data-prepper", + "value": "data_prepper", "selected": false } ], "hide": 0, "multi": false, "includeAll": true, - "allValue": "ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper", + "allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper", "refresh": 1, "sort": 1, "skipUrlSync": false diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 135d992f..929b4744 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -16,7 +16,7 @@ data: { "id": 2, "type": "stat", - "title": "Success Rate (24h)", + "title": "Run Reliability (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -90,7 +90,7 @@ data: { "id": 3, "type": "stat", - "title": "Success Rate (30d)", + "title": "Run Reliability (30d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -164,7 +164,7 @@ data: { "id": 4, "type": "stat", - "title": "Failures (24h)", + "title": "Failed Runs (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -441,7 +441,7 @@ data: { "id": 8, "type": "bargauge", - "title": "Failures by Suite (24h)", + "title": "Current Gate Health by Suite", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -454,81 +454,7 @@ data: }, "targets": [ { - "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status!~\"ok|passed|success\"}[24h])))", - "refId": "A", - "legendFormat": "{{suite}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 3 - }, - { - "color": "red", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 9, - "type": "bargauge", - "title": "Success Rate by Suite (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 5 - }, - "targets": [ - { - "expr": "sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1))", + "expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", "refId": "A", "legendFormat": "{{suite}}", "instant": true @@ -589,7 +515,98 @@ data: "order": "asc" } } - ] + ], + "description": "Current pass percentage across the required gate dimensions reported by each suite. This is the fastest place to answer whether the latest suite quality signal is healthy." + }, + { + "id": 9, + "type": "bargauge", + "title": "Run Reliability by Suite (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 5 + }, + "targets": [ + { + "expr": "sort_desc(((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))) > 0)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h])))) - 1))", + "refId": "A", + "legendFormat": "{{suite}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 90 + }, + { + "color": "yellow", + "value": 93 + }, + { + "color": "green", + "value": 95 + }, + { + "color": "blue", + "value": 100 + } + ] + }, + "decimals": 2, + "mappings": [ + { + "type": "value", + "options": { + "-1": { + "text": "no runs" + } + } + } + ] + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "asc" + } + } + ], + "description": "Rolling CI run success rate. This can stay low after failed/debug runs even when Current Gate Health is green." }, { "id": 10, @@ -670,7 +687,7 @@ data: { "id": 11, "type": "timeseries", - "title": "Success History by Suite", + "title": "Run Reliability History by Suite", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -3389,7 +3406,7 @@ data: "name": "suite", "label": "Suite", "type": "custom", - "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus|pegasus-health|pegasus_health,soteria : soteria,titan_iac : titan_iac|titan-iac,bstein_home : bstein_home|bstein-home,data_prepper : data_prepper|data-prepper", + "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper", "current": { "text": "All", "value": "$__all", @@ -3418,7 +3435,7 @@ data: }, { "text": "pegasus", - "value": "pegasus|pegasus-health|pegasus_health", + "value": "pegasus", "selected": false }, { @@ -3428,24 +3445,24 @@ data: }, { "text": "titan_iac", - "value": "titan_iac|titan-iac", + "value": "titan_iac", "selected": false }, { "text": "bstein_home", - "value": "bstein_home|bstein-home", + "value": "bstein_home", "selected": false }, { "text": "data_prepper", - "value": "data_prepper|data-prepper", + "value": "data_prepper", "selected": false } ], "hide": 0, "multi": false, "includeAll": true, - "allValue": "ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper", + "allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper", "refresh": 1, "sort": 1, "skipUrlSync": false diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml index d6371f1f..401901da 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -16,7 +16,7 @@ data: { "id": 2, "type": "stat", - "title": "Success Rate (24h)", + "title": "Run Reliability (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -90,7 +90,7 @@ data: { "id": 3, "type": "stat", - "title": "Success Rate (30d)", + "title": "Run Reliability (30d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -164,7 +164,7 @@ data: { "id": 4, "type": "stat", - "title": "Failures (24h)", + "title": "Failed Runs (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -441,7 +441,7 @@ data: { "id": 8, "type": "bargauge", - "title": "Failures by Suite (24h)", + "title": "Current Gate Health by Suite", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -454,81 +454,7 @@ data: }, "targets": [ { - "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status!~\"ok|passed|success\"}[24h])))", - "refId": "A", - "legendFormat": "{{suite}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 3 - }, - { - "color": "red", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 9, - "type": "bargauge", - "title": "Success Rate by Suite (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 5 - }, - "targets": [ - { - "expr": "sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1))", + "expr": "sort((100 * sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",result=~\"ok|passed|success|not_applicable|skipped|na|n/a\"} > bool 0))) / clamp_min(sum by (suite) (max by (suite, check) (({__name__=~\".*_quality_gate_checks_total\",suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"} > bool 0))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", "refId": "A", "legendFormat": "{{suite}}", "instant": true @@ -589,7 +515,98 @@ data: "order": "asc" } } - ] + ], + "description": "Current pass percentage across the required gate dimensions reported by each suite. This is the fastest place to answer whether the latest suite quality signal is healthy." + }, + { + "id": 9, + "type": "bargauge", + "title": "Run Reliability by Suite (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 5 + }, + "targets": [ + { + "expr": "sort_desc(((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h]))) > 0)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[24h])))) - 1))", + "refId": "A", + "legendFormat": "{{suite}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 90 + }, + { + "color": "yellow", + "value": 93 + }, + { + "color": "green", + "value": 95 + }, + { + "color": "blue", + "value": 100 + } + ] + }, + "decimals": 2, + "mappings": [ + { + "type": "value", + "options": { + "-1": { + "text": "no runs" + } + } + } + ] + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "asc" + } + } + ], + "description": "Rolling CI run success rate. This can stay low after failed/debug runs even when Current Gate Health is green." }, { "id": 10, @@ -670,7 +687,7 @@ data: { "id": 11, "type": "timeseries", - "title": "Success History by Suite", + "title": "Run Reliability History by Suite", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -3389,7 +3406,7 @@ data: "name": "suite", "label": "Suite", "type": "custom", - "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus|pegasus-health|pegasus_health,soteria : soteria,titan_iac : titan_iac|titan-iac,bstein_home : bstein_home|bstein-home,data_prepper : data_prepper|data-prepper", + "query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper", "current": { "text": "All", "value": "$__all", @@ -3418,7 +3435,7 @@ data: }, { "text": "pegasus", - "value": "pegasus|pegasus-health|pegasus_health", + "value": "pegasus", "selected": false }, { @@ -3428,24 +3445,24 @@ data: }, { "text": "titan_iac", - "value": "titan_iac|titan-iac", + "value": "titan_iac", "selected": false }, { "text": "bstein_home", - "value": "bstein_home|bstein-home", + "value": "bstein_home", "selected": false }, { "text": "data_prepper", - "value": "data_prepper|data-prepper", + "value": "data_prepper", "selected": false } ], "hide": 0, "multi": false, "includeAll": true, - "allValue": "ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper", + "allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper", "refresh": 1, "sort": 1, "skipUrlSync": false