diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 9c095e59..c34137ce 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -1169,7 +1169,7 @@ def testing_case_variable(): "name": "test", "label": "Test Case", "type": "query", - "query": f'label_values(platform_quality_gate_test_case_result{{suite=~"${{suite:regex}}",branch=~"${{branch:regex}}",test!="__no_test_cases__",{PLATFORM_TEST_EXPORT_FILTER}}}, test)', + "query": f'label_values(platform_quality_gate_test_case_result{{suite=~"${{suite:regex}}",branch!="",branch=~"${{branch:regex}}",test!="",test!="__no_test_cases__",{PLATFORM_TEST_EXPORT_FILTER}}}, test)', "current": {"text": "All", "value": "$__all", "selected": True}, "options": [], "hide": 0, @@ -1187,7 +1187,7 @@ def testing_branch_variable(): "name": "branch", "label": "Branch", "type": "query", - "query": f'label_values(platform_quality_gate_build_info{{suite=~"${{suite:regex}}",{PLATFORM_TEST_EXPORT_FILTER}}}, branch)', + "query": f'label_values(platform_quality_gate_build_info{{suite=~"${{suite:regex}}",branch!="",{PLATFORM_TEST_EXPORT_FILTER}}}, branch)', "current": {"text": "All", "value": "$__all", "selected": True}, "options": [], "hide": 0, @@ -3385,9 +3385,7 @@ def build_jobs_dashboard(): coverage_metric_selector = f'__name__=~".*_quality_gate_coverage_percent",suite=~"{suite_var}",{exported}' workspace_coverage_selector = f'suite=~"{suite_var}",{exported}' smell_selector = f'suite=~"{suite_var}",{exported}' - test_case_selector = f'suite=~"{suite_var}",branch=~"{branch_var}",test=~"{test_var}",test!="__no_test_cases__",{exported}' - all_test_case_selector = f'suite=~"{suite_var}",branch=~"{branch_var}",test!="__no_test_cases__",{exported}' - build_info_selector = f'suite=~"{suite_var}",branch=~"{branch_var}",{exported}' + build_info_selector = f'suite=~"{suite_var}",branch!="",branch=~"{branch_var}",{exported}' selected_suite_universe = ( f'(sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[30d])) >= bool 0)' ) @@ -3488,59 +3486,51 @@ def build_jobs_dashboard(): return f"(100 * ({state_checks}) / clamp_min(({total_checks}), 1)) and on(suite) (({total_checks}) > 0)" rollup_failed_tests = ( - f'sum by (suite, test) (platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch=~"{branch_var}",test!="__no_test_cases__",status="failed"}})' + f'sum by (suite, test) (platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}})' ) raw_failed_tests = ( - f'sum by (suite, test) (max_over_time(platform_quality_gate_test_case_result{{{all_test_case_selector},status="failed"}}[$__interval]))' + f'sum by (suite, test) (max_over_time(platform_quality_gate_test_case_result{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",{exported},status="failed"}}[$__interval]))' ) problematic_tests_history_core = f"topk(12, (({rollup_failed_tests}) or on(suite, test) ({raw_failed_tests})))" - problematic_tests_history = f"({problematic_tests_history_core}) or on() vector(0)" + problematic_tests_history = problematic_tests_history_core rollup_failed_tests_30d = ( - f'sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch=~"{branch_var}",test!="__no_test_cases__",status="failed"}}[30d:1h]))' + f'sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}}[30d:1h]))' ) raw_failed_tests_30d = ( - f'sum by (suite, test) (increase(platform_quality_gate_test_case_result{{{all_test_case_selector},status="failed"}}[30d]))' + f'sum by (suite, test) (increase(platform_quality_gate_test_case_result{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",{exported},status="failed"}}[30d]))' ) worst_test_per_suite_core = ( f"topk by (suite) (1, (({rollup_failed_tests_30d}) or on(suite, test) ({raw_failed_tests_30d})))" ) - worst_test_per_suite = f"({worst_test_per_suite_core}) or on() vector(0)" + worst_test_per_suite = worst_test_per_suite_core - def _selected_status_history(status: str) -> str: - rollup = ( - f'sum by (suite) (platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch=~"{branch_var}",test=~"{test_var}",test!="__no_test_cases__",status="{status}"}})' + def _selected_status_volume(status: str) -> str: + return ( + f'(sum(platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",' + f'branch=~"{branch_var}",test!="",test=~"{test_var}",test!="__no_test_cases__",' + f'status="{status}"}}) or on() vector(0))' ) - raw = ( - f'sum by (suite) (max_over_time(platform_quality_gate_test_case_result{{{test_case_selector},status="{status}"}}[$__interval]))' - ) - return f"(({rollup}) or on(suite) ({raw}) or on(suite) ({selected_suite_zero}))" - selected_passed_history = _selected_status_history("passed") - selected_failed_history = _selected_status_history("failed") - selected_skipped_history = _selected_status_history("skipped") - selected_total_history = ( - f'(sum by (suite) (platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch=~"{branch_var}",test=~"{test_var}",test!="__no_test_cases__",status=~"passed|failed|error|skipped"}}) ' - f'or on(suite) sum by (suite) (max_over_time(platform_quality_gate_test_case_result{{{test_case_selector},status=~"passed|failed|error|skipped"}}[$__interval])))' - ) selected_test_pass_fail = [ { "refId": "A", - "expr": selected_passed_history, - "legendFormat": "{{suite}} passed", + "expr": _selected_status_volume("passed"), + "legendFormat": "Passed", }, { "refId": "B", - "expr": selected_failed_history, - "legendFormat": "{{suite}} failed", + "expr": _selected_status_volume("failed"), + "legendFormat": "Failed", }, { "refId": "C", - "expr": selected_skipped_history, - "legendFormat": "{{suite}} skipped", + "expr": _selected_status_volume("skipped"), + "legendFormat": "Skipped", }, ] selected_test_pass_rate = ( - f"((100 * ({selected_passed_history}) / clamp_min(({selected_total_history}), 1)) or on(suite) ({selected_suite_zero}))" + f'avg by (suite) (platform_quality:test_case_pass_rate:percent_1h{{suite=~"{suite_var}",' + f'branch!="",branch=~"{branch_var}",test!="",test=~"{test_var}",test!="__no_test_cases__"}})' ) recent_branch_evidence = ( f'sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[30d])))' @@ -3628,7 +3618,25 @@ def build_jobs_dashboard(): *, description: str, thresholds: dict, + unit: str = "percent", + min_value: int | float | None = 0, + max_value: int | float | None = 100, + legend: str = "{{suite}}", ) -> dict: + defaults = { + "color": {"mode": "thresholds"}, + "unit": unit, + "thresholds": thresholds, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": True, + }, + } + if min_value is not None: + defaults["min"] = min_value + if max_value is not None: + defaults["max"] = max_value panel = { "id": panel_id, "type": "state-timeline", @@ -3636,20 +3644,9 @@ def build_jobs_dashboard(): "description": description, "datasource": PROM_DS, "gridPos": grid, - "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{suite}}"}], + "targets": [{"expr": expr, "refId": "A", "legendFormat": legend}], "fieldConfig": { - "defaults": { - "color": {"mode": "thresholds"}, - "unit": "percent", - "min": 0, - "max": 100, - "thresholds": thresholds, - "custom": { - "fillOpacity": 70, - "lineWidth": 0, - "spanNulls": True, - }, - }, + "defaults": defaults, "overrides": [], }, "options": { @@ -3860,63 +3857,47 @@ def build_jobs_dashboard(): "Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." ) for index, (label, regex) in enumerate(check_dimensions[:4]): - panel = timeseries_panel( + panel = _state_timeline_panel( start_id + index, f"{label} {title_prefix}", _check_state_percent_series(regex, failed), {"h": 7, "w": 6, "x": index * 6, "y": y}, - unit="percent", - legend="{{suite}}", - legend_display="list", - legend_placement="bottom", - legend_calcs=[], + thresholds=trend_thresholds, + description=trend_description, ) - panel["description"] = trend_description - panel["fieldConfig"]["defaults"]["thresholds"] = trend_thresholds - panel["fieldConfig"]["defaults"]["min"] = 0 - panel["fieldConfig"]["defaults"]["max"] = 100 - panel["fieldConfig"]["defaults"].setdefault("custom", {})["spanNulls"] = True - panel["fieldConfig"]["defaults"]["custom"]["showPoints"] = "never" - panel["fieldConfig"]["defaults"]["custom"]["lineWidth"] = 2 panels.append(panel) for index, (label, regex) in enumerate(check_dimensions[4:]): - panel = timeseries_panel( + panel = _state_timeline_panel( start_id + 4 + index, f"{label} {title_prefix}", _check_state_percent_series(regex, failed), {"h": 7, "w": 8, "x": index * 8, "y": y + 7}, - unit="percent", - legend="{{suite}}", - legend_display="list", - legend_placement="bottom", - legend_calcs=[], + thresholds=trend_thresholds, + description=trend_description, ) - panel["description"] = trend_description - panel["fieldConfig"]["defaults"]["thresholds"] = trend_thresholds - panel["fieldConfig"]["defaults"]["min"] = 0 - panel["fieldConfig"]["defaults"]["max"] = 100 - panel["fieldConfig"]["defaults"].setdefault("custom", {})["spanNulls"] = True - panel["fieldConfig"]["defaults"]["custom"]["showPoints"] = "never" - panel["fieldConfig"]["defaults"]["custom"]["lineWidth"] = 2 panels.append(panel) _append_check_trends(130, "Failure Rate", True, 29) _append_check_trends(138, "Healthy Rate", False, 43) panels.append( - timeseries_panel( + _state_timeline_panel( 145, "Problematic Tests Over Time (Top failures)", problematic_tests_history, {"h": 8, "w": 12, "x": 0, "y": 57}, + thresholds=failures_thresholds, unit="none", + min_value=0, + max_value=None, legend="{{suite}} - {{test}}", - legend_display="list", - legend_placement="right", - legend_calcs=[], - links=jenkins_suite_links(), - data_links=jenkins_latest_artifact_data_links(), + description=( + "Top failing test cases over time, using memoized hourly rollups. " + "Blank branch/test labels and placeholder no-test-case rows are excluded." + ), ) ) + panels[-1]["links"] = jenkins_suite_links() + panels[-1]["fieldConfig"]["defaults"]["links"] = jenkins_latest_artifact_data_links() panels.append( bargauge_panel( 147, @@ -3948,22 +3929,32 @@ def build_jobs_dashboard(): data_links=jenkins_artifact_data_links(), ) ) - selected_pass_rate_panel = timeseries_panel( + panels[-1]["description"] = ( + "Stacked hourly outcome volume for the selected suite/branch/test scope. " + "This uses vmalert rollups only, avoiding expensive raw 30-day per-test scans." + ) + panels[-1]["fieldConfig"]["defaults"]["min"] = 0 + panels[-1]["fieldConfig"]["defaults"]["custom"] = { + "drawStyle": "bars", + "barAlignment": 0, + "lineWidth": 0, + "fillOpacity": 70, + "stacking": {"mode": "normal", "group": "A"}, + } + selected_pass_rate_panel = _state_timeline_panel( 152, "Selected Test Pass Rate History", selected_test_pass_rate, {"h": 8, "w": 12, "x": 12, "y": 65}, - unit="percent", + thresholds=success_thresholds, legend="{{suite}}", - legend_display="list", - legend_placement="bottom", - legend_calcs=[], - links=jenkins_suite_links(), - data_links=jenkins_artifact_data_links(), + description=( + "Average pass rate per suite for the selected test filter, using memoized hourly " + "test-case pass-rate rollups instead of raw historical scans." + ), ) - selected_pass_rate_panel["fieldConfig"]["defaults"]["min"] = 0 - selected_pass_rate_panel["fieldConfig"]["defaults"]["max"] = 100 - selected_pass_rate_panel["fieldConfig"]["defaults"]["thresholds"] = success_thresholds + selected_pass_rate_panel["links"] = jenkins_suite_links() + selected_pass_rate_panel["fieldConfig"]["defaults"]["links"] = jenkins_artifact_data_links() panels.append(selected_pass_rate_panel) coverage_panel = bargauge_panel( @@ -4078,7 +4069,7 @@ def build_jobs_dashboard(): stat_panel( 32, "Sonar Projects (Selected)", - f'(count(sonarqube_project_quality_gate_pass{{project_key=~"{suite_var}"}}) or on() vector(0))', + f'(count(max by (project_key) (sonarqube_project_quality_gate_pass{{project_key=~"{suite_var}"}})) or on() vector(0))', {"h": 6, "w": 4, "x": 4, "y": 88}, unit="none", instant=True, @@ -4099,23 +4090,32 @@ def build_jobs_dashboard(): sonar_status_mix_panel = pie_panel( 34, "Sonar Gate Status Mix (Selected)", - f'count by (status) (sonarqube_project_quality_gate_pass{{project_key=~"{suite_var}"}})', - {"h": 6, "w": 6, "x": 12, "y": 88}, + f'count by (status) (max by (project_key, status) (sonarqube_project_quality_gate_pass{{project_key=~"{suite_var}"}}))', + {"h": 6, "w": 4, "x": 12, "y": 88}, ) sonar_status_mix_panel["targets"][0]["legendFormat"] = "{{status}}" panels.append(sonar_status_mix_panel) panels.append( - bargauge_panel( + _state_timeline_panel( 35, "Projects Failing Sonar Gate", - f'(sort_desc(count by (project_key) (sonarqube_project_quality_gate_pass{{project_key=~"{suite_var}",status!~"OK|ok"}})) ' - f'or on() label_replace(vector(0), "project_key", "none", "__name__", ".*"))', - {"h": 6, "w": 6, "x": 18, "y": 88}, + f'max by (project_key) ((max by (project_key, status) (sonarqube_project_quality_gate_pass{{project_key=~"{suite_var}",status!~"OK|ok"}})) * 0 + 1)', + {"h": 6, "w": 8, "x": 16, "y": 88}, + thresholds={ + "mode": "absolute", + "steps": [ + {"color": dark_green, "value": None}, + {"color": dark_red, "value": 1}, + ], + }, unit="none", - instant=True, + min_value=0, + max_value=1, legend="{{project_key}}", - sort_order="desc", - thresholds=failures_thresholds, + description=( + "Projects observed with a non-OK SonarQube gate status over time. " + "The query deduplicates pod/service endpoint scrapes before rendering." + ), ) ) panels.append( @@ -4222,8 +4222,8 @@ def build_jobs_dashboard(): 31: {"h": 6, "w": 4, "x": 0, "y": 111}, 32: {"h": 6, "w": 4, "x": 4, "y": 111}, 33: {"h": 6, "w": 4, "x": 8, "y": 111}, - 34: {"h": 6, "w": 6, "x": 12, "y": 111}, - 35: {"h": 6, "w": 6, "x": 18, "y": 111}, + 34: {"h": 6, "w": 4, "x": 12, "y": 111}, + 35: {"h": 6, "w": 8, "x": 16, "y": 111}, } for panel_id, grid in row_layout.items(): panel_by_id[panel_id]["gridPos"] = grid diff --git a/scripts/tests/test_dashboards_render_atlas.py b/scripts/tests/test_dashboards_render_atlas.py index b8b71ef3..b0b84e7d 100644 --- a/scripts/tests/test_dashboards_render_atlas.py +++ b/scripts/tests/test_dashboards_render_atlas.py @@ -193,10 +193,30 @@ def test_jobs_dashboard_collapses_heavy_drilldowns_for_light_first_paint(): assert "SonarQube API Up" in nested_panels_by_title failure_rate_panel = nested_panels_by_title["Coverage Failure Rate"] + assert failure_rate_panel["type"] == "state-timeline" assert failure_rate_panel["fieldConfig"]["defaults"]["unit"] == "percent" assert failure_rate_panel["fieldConfig"]["defaults"]["max"] == 100 assert "increase(" not in failure_rate_panel["targets"][0]["expr"] pass_rate_panel = nested_panels_by_title["Selected Test Pass Rate History"] - assert "platform_quality_gate_test_case_result" in pass_rate_panel["targets"][0]["expr"] - assert "platform_quality:test_case_pass_rate:percent_1h" not in pass_rate_panel["targets"][0]["expr"] + assert pass_rate_panel["type"] == "state-timeline" + assert "platform_quality:test_case_pass_rate:percent_1h" in pass_rate_panel["targets"][0]["expr"] + assert "platform_quality_gate_test_case_result" not in pass_rate_panel["targets"][0]["expr"] + + pass_fail_panel = nested_panels_by_title["Selected Test Pass/Fail History"] + assert pass_fail_panel["fieldConfig"]["defaults"]["custom"]["drawStyle"] == "bars" + assert all( + "platform_quality:test_case_status:count_1h" in target["expr"] + for target in pass_fail_panel["targets"] + ) + + problematic_panel = nested_panels_by_title["Problematic Tests Over Time (Top failures)"] + assert problematic_panel["type"] == "state-timeline" + assert 'test!=""' in problematic_panel["targets"][0]["expr"] + assert "vector(0)" not in problematic_panel["targets"][0]["expr"] + + sonar_mix_panel = nested_panels_by_title["Sonar Gate Status Mix (Selected)"] + sonar_failing_panel = nested_panels_by_title["Projects Failing Sonar Gate"] + assert sonar_mix_panel["gridPos"]["w"] == 4 + assert sonar_failing_panel["gridPos"]["w"] == 8 + assert sonar_failing_panel["type"] == "state-timeline" diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index 4cf05596..787d9350 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -830,8 +830,6 @@ "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -861,7 +859,9 @@ "fillOpacity": 70, "lineWidth": 0, "spanNulls": true - } + }, + "min": 0, + "max": 100 }, "overrides": [] }, @@ -960,8 +960,6 @@ "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -991,7 +989,9 @@ "fillOpacity": 70, "lineWidth": 0, "spanNulls": true - } + }, + "min": 0, + "max": 100 }, "overrides": [] }, @@ -1036,8 +1036,6 @@ "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -1067,7 +1065,9 @@ "fillOpacity": 70, "lineWidth": 0, "spanNulls": true - } + }, + "min": 0, + "max": 100 }, "overrides": [] }, @@ -1100,8 +1100,9 @@ "panels": [ { "id": 130, - "type": "timeseries", + "type": "state-timeline", "title": "Tests Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1121,6 +1122,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1143,31 +1147,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 131, - "type": "timeseries", + "type": "state-timeline", "title": "Coverage Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1187,6 +1194,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1209,31 +1219,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 132, - "type": "timeseries", + "type": "state-timeline", "title": "LOC Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1253,6 +1266,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1275,31 +1291,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 133, - "type": "timeseries", + "type": "state-timeline", "title": "Style Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1319,6 +1338,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1341,31 +1363,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 134, - "type": "timeseries", + "type": "state-timeline", "title": "Gate Glue Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1385,6 +1410,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1407,31 +1435,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 135, - "type": "timeseries", + "type": "state-timeline", "title": "SonarQube Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1451,6 +1482,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1473,31 +1507,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 136, - "type": "timeseries", + "type": "state-timeline", "title": "Supply Chain Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1517,6 +1554,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1539,26 +1579,28 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } } ] }, @@ -1576,8 +1618,9 @@ "panels": [ { "id": 138, - "type": "timeseries", + "type": "state-timeline", "title": "Tests Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1597,6 +1640,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1623,31 +1669,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 139, - "type": "timeseries", + "type": "state-timeline", "title": "Coverage Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1667,6 +1716,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1693,31 +1745,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 140, - "type": "timeseries", + "type": "state-timeline", "title": "LOC Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1737,6 +1792,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1763,31 +1821,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 141, - "type": "timeseries", + "type": "state-timeline", "title": "Style Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1807,6 +1868,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1833,31 +1897,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 142, - "type": "timeseries", + "type": "state-timeline", "title": "Gate Glue Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1877,6 +1944,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1903,31 +1973,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 143, - "type": "timeseries", + "type": "state-timeline", "title": "SonarQube Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1947,6 +2020,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1973,31 +2049,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 144, - "type": "timeseries", + "type": "state-timeline", "title": "Supply Chain Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2017,6 +2096,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -2043,26 +2125,28 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } } ] }, @@ -2080,8 +2164,9 @@ "panels": [ { "id": 145, - "type": "timeseries", + "type": "state-timeline", "title": "Problematic Tests Over Time (Top failures)", + "description": "Top failing test cases over time, using memoized hourly rollups. Blank branch/test labels and placeholder no-test-case rows are excluded.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2094,14 +2179,44 @@ }, "targets": [ { - "expr": "(topk(12, ((sum by (suite, test) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",status=\"failed\"})) or on(suite, test) (sum by (suite, test) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[$__interval])))))) or on() vector(0)", + "expr": "topk(12, ((sum by (suite, test) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"})) or on(suite, test) (sum by (suite, test) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[$__interval])))))", "refId": "A", "legendFormat": "{{suite}} - {{test}}" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "none", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-green", + "value": null + }, + { + "color": "dark-yellow", + "value": 1 + }, + { + "color": "dark-orange", + "value": 3 + }, + { + "color": "dark-red", + "value": 5 + } + ] + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, "links": [ { "title": "Open latest artifacts", @@ -2118,12 +2233,15 @@ "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", - "placement": "right" + "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } }, "links": [ @@ -2240,7 +2358,7 @@ }, "targets": [ { - "expr": "sort_desc((topk by (suite) (1, ((sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",status=\"failed\"}[30d:1h]))) or on(suite, test) (sum by (suite, test) (increase(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[30d])))))) or on() vector(0))", + "expr": "sort_desc(topk by (suite) (1, ((sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[30d:1h]))) or on(suite, test) (sum by (suite, test) (increase(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[30d]))))))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{test}}", "instant": true @@ -2433,18 +2551,18 @@ "targets": [ { "refId": "A", - "expr": "((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"passed\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"passed\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", - "legendFormat": "{{suite}} passed" + "expr": "(sum(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"passed\"}) or on() vector(0))", + "legendFormat": "Passed" }, { "refId": "B", - "expr": "((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"failed\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", - "legendFormat": "{{suite}} failed" + "expr": "(sum(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"failed\"}) or on() vector(0))", + "legendFormat": "Failed" }, { "refId": "C", - "expr": "((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"skipped\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"skipped\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", - "legendFormat": "{{suite}} skipped" + "expr": "(sum(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"skipped\"}) or on() vector(0))", + "legendFormat": "Skipped" } ], "fieldConfig": { @@ -2461,7 +2579,18 @@ "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/", "targetBlank": true } - ] + ], + "min": 0, + "custom": { + "drawStyle": "bars", + "barAlignment": 0, + "lineWidth": 0, + "fillOpacity": 70, + "stacking": { + "mode": "normal", + "group": "A" + } + } }, "overrides": [] }, @@ -2570,12 +2699,14 @@ "url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/", "targetBlank": true } - ] + ], + "description": "Stacked hourly outcome volume for the selected suite/branch/test scope. This uses vmalert rollups only, avoiding expensive raw 30-day per-test scans." }, { "id": 152, - "type": "timeseries", + "type": "state-timeline", "title": "Selected Test Pass Rate History", + "description": "Average pass rate per suite for the selected test filter, using memoized hourly test-case pass-rate rollups instead of raw historical scans.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2588,28 +2719,17 @@ }, "targets": [ { - "expr": "((100 * (((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"passed\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"passed\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))) / clamp_min(((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=~\"passed|failed|error|skipped\"}) or on(suite) sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=~\"passed|failed|error|skipped\"}[$__interval])))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", + "expr": "avg by (suite) (platform_quality:test_case_pass_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\"})", "refId": "A", "legendFormat": "{{suite}}" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", - "links": [ - { - "title": "Open build artifacts", - "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/artifact/", - "targetBlank": true - }, - { - "title": "Open build", - "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/", - "targetBlank": true - } - ], - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -2634,17 +2754,39 @@ "value": 100 } ] - } + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100, + "links": [ + { + "title": "Open build artifacts", + "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/artifact/", + "targetBlank": true + }, + { + "title": "Open build", + "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/", + "targetBlank": true + } + ] }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } }, "links": [ @@ -3267,7 +3409,7 @@ }, "targets": [ { - "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d])))", + "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d])))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{branch}}", "instant": true @@ -3434,7 +3576,7 @@ }, "targets": [ { - "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[30d])))", + "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[30d])))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{branch}}", "instant": true @@ -3684,7 +3826,7 @@ }, "targets": [ { - "expr": "(count(sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"}) or on() vector(0))", + "expr": "(count(max by (project_key) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"})) or on() vector(0))", "refId": "A", "instant": true } @@ -3816,13 +3958,13 @@ }, "gridPos": { "h": 6, - "w": 6, + "w": 4, "x": 12, "y": 111 }, "targets": [ { - "expr": "count by (status) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"})", + "expr": "count by (status) (max by (project_key, status) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"}))", "refId": "A", "legendFormat": "{{status}}" } @@ -3859,24 +4001,24 @@ }, { "id": 35, - "type": "bargauge", + "type": "state-timeline", "title": "Projects Failing Sonar Gate", + "description": "Projects observed with a non-OK SonarQube gate status over time. The query deduplicates pod/service endpoint scrapes before rendering.", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, - "w": 6, - "x": 18, + "w": 8, + "x": 16, "y": 111 }, "targets": [ { - "expr": "sort_desc((sort_desc(count by (project_key) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\",status!~\"OK|ok\"})) or on() label_replace(vector(0), \"project_key\", \"none\", \"__name__\", \".*\")))", + "expr": "max by (project_key) ((max by (project_key, status) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\",status!~\"OK|ok\"})) * 0 + 1)", "refId": "A", - "legendFormat": "{{project_key}}", - "instant": true + "legendFormat": "{{project_key}}" } ], "fieldConfig": { @@ -3885,8 +4027,6 @@ "mode": "thresholds" }, "unit": "none", - "min": 0, - "max": null, "thresholds": { "mode": "absolute", "steps": [ @@ -3894,45 +4034,34 @@ "color": "dark-green", "value": null }, - { - "color": "dark-yellow", - "value": 1 - }, - { - "color": "dark-orange", - "value": 3 - }, { "color": "dark-red", - "value": 5 + "value": 1 } ] - } + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 1 }, "overrides": [] }, "options": { - "displayMode": "basic", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "mergeValues": true, + "showValue": "never", + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + } } ] } @@ -4023,7 +4152,7 @@ "name": "branch", "label": "Branch", "type": "query", - "query": "label_values(platform_quality_gate_build_info{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}, branch)", + "query": "label_values(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",exported_job=\"platform-quality-ci\"}, branch)", "current": { "text": "All", "value": "$__all", @@ -4042,7 +4171,7 @@ "name": "test", "label": "Test Case", "type": "query", - "query": "label_values(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\"}, test)", + "query": "label_values(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\"}, test)", "current": { "text": "All", "value": "$__all", diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json index 47939014..33a87bf7 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-testing.json @@ -830,8 +830,6 @@ "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -861,7 +859,9 @@ "fillOpacity": 70, "lineWidth": 0, "spanNulls": true - } + }, + "min": 0, + "max": 100 }, "overrides": [] }, @@ -960,8 +960,6 @@ "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -991,7 +989,9 @@ "fillOpacity": 70, "lineWidth": 0, "spanNulls": true - } + }, + "min": 0, + "max": 100 }, "overrides": [] }, @@ -1036,8 +1036,6 @@ "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -1067,7 +1065,9 @@ "fillOpacity": 70, "lineWidth": 0, "spanNulls": true - } + }, + "min": 0, + "max": 100 }, "overrides": [] }, @@ -1100,8 +1100,9 @@ "panels": [ { "id": 130, - "type": "timeseries", + "type": "state-timeline", "title": "Tests Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1121,6 +1122,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1143,31 +1147,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 131, - "type": "timeseries", + "type": "state-timeline", "title": "Coverage Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1187,6 +1194,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1209,31 +1219,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 132, - "type": "timeseries", + "type": "state-timeline", "title": "LOC Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1253,6 +1266,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1275,31 +1291,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 133, - "type": "timeseries", + "type": "state-timeline", "title": "Style Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1319,6 +1338,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1341,31 +1363,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 134, - "type": "timeseries", + "type": "state-timeline", "title": "Gate Glue Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1385,6 +1410,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1407,31 +1435,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 135, - "type": "timeseries", + "type": "state-timeline", "title": "SonarQube Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1451,6 +1482,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1473,31 +1507,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 136, - "type": "timeseries", + "type": "state-timeline", "title": "Supply Chain Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1517,6 +1554,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1539,26 +1579,28 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } } ] }, @@ -1576,8 +1618,9 @@ "panels": [ { "id": 138, - "type": "timeseries", + "type": "state-timeline", "title": "Tests Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1597,6 +1640,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1623,31 +1669,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 139, - "type": "timeseries", + "type": "state-timeline", "title": "Coverage Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1667,6 +1716,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1693,31 +1745,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 140, - "type": "timeseries", + "type": "state-timeline", "title": "LOC Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1737,6 +1792,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1763,31 +1821,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 141, - "type": "timeseries", + "type": "state-timeline", "title": "Style Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1807,6 +1868,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1833,31 +1897,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 142, - "type": "timeseries", + "type": "state-timeline", "title": "Gate Glue Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1877,6 +1944,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1903,31 +1973,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 143, - "type": "timeseries", + "type": "state-timeline", "title": "SonarQube Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1947,6 +2020,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1973,31 +2049,34 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 144, - "type": "timeseries", + "type": "state-timeline", "title": "Supply Chain Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2017,6 +2096,9 @@ ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -2043,26 +2125,28 @@ } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } } ] }, @@ -2080,8 +2164,9 @@ "panels": [ { "id": 145, - "type": "timeseries", + "type": "state-timeline", "title": "Problematic Tests Over Time (Top failures)", + "description": "Top failing test cases over time, using memoized hourly rollups. Blank branch/test labels and placeholder no-test-case rows are excluded.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2094,14 +2179,44 @@ }, "targets": [ { - "expr": "(topk(12, ((sum by (suite, test) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",status=\"failed\"})) or on(suite, test) (sum by (suite, test) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[$__interval])))))) or on() vector(0)", + "expr": "topk(12, ((sum by (suite, test) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"})) or on(suite, test) (sum by (suite, test) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[$__interval])))))", "refId": "A", "legendFormat": "{{suite}} - {{test}}" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "none", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-green", + "value": null + }, + { + "color": "dark-yellow", + "value": 1 + }, + { + "color": "dark-orange", + "value": 3 + }, + { + "color": "dark-red", + "value": 5 + } + ] + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, "links": [ { "title": "Open latest artifacts", @@ -2118,12 +2233,15 @@ "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", - "placement": "right" + "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } }, "links": [ @@ -2240,7 +2358,7 @@ }, "targets": [ { - "expr": "sort_desc((topk by (suite) (1, ((sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",status=\"failed\"}[30d:1h]))) or on(suite, test) (sum by (suite, test) (increase(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[30d])))))) or on() vector(0))", + "expr": "sort_desc(topk by (suite) (1, ((sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[30d:1h]))) or on(suite, test) (sum by (suite, test) (increase(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[30d]))))))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{test}}", "instant": true @@ -2433,18 +2551,18 @@ "targets": [ { "refId": "A", - "expr": "((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"passed\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"passed\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", - "legendFormat": "{{suite}} passed" + "expr": "(sum(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"passed\"}) or on() vector(0))", + "legendFormat": "Passed" }, { "refId": "B", - "expr": "((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"failed\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", - "legendFormat": "{{suite}} failed" + "expr": "(sum(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"failed\"}) or on() vector(0))", + "legendFormat": "Failed" }, { "refId": "C", - "expr": "((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"skipped\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"skipped\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", - "legendFormat": "{{suite}} skipped" + "expr": "(sum(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"skipped\"}) or on() vector(0))", + "legendFormat": "Skipped" } ], "fieldConfig": { @@ -2461,7 +2579,18 @@ "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/", "targetBlank": true } - ] + ], + "min": 0, + "custom": { + "drawStyle": "bars", + "barAlignment": 0, + "lineWidth": 0, + "fillOpacity": 70, + "stacking": { + "mode": "normal", + "group": "A" + } + } }, "overrides": [] }, @@ -2570,12 +2699,14 @@ "url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/", "targetBlank": true } - ] + ], + "description": "Stacked hourly outcome volume for the selected suite/branch/test scope. This uses vmalert rollups only, avoiding expensive raw 30-day per-test scans." }, { "id": 152, - "type": "timeseries", + "type": "state-timeline", "title": "Selected Test Pass Rate History", + "description": "Average pass rate per suite for the selected test filter, using memoized hourly test-case pass-rate rollups instead of raw historical scans.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2588,28 +2719,17 @@ }, "targets": [ { - "expr": "((100 * (((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"passed\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"passed\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))) / clamp_min(((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=~\"passed|failed|error|skipped\"}) or on(suite) sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=~\"passed|failed|error|skipped\"}[$__interval])))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", + "expr": "avg by (suite) (platform_quality:test_case_pass_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\"})", "refId": "A", "legendFormat": "{{suite}}" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", - "links": [ - { - "title": "Open build artifacts", - "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/artifact/", - "targetBlank": true - }, - { - "title": "Open build", - "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/", - "targetBlank": true - } - ], - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -2634,17 +2754,39 @@ "value": 100 } ] - } + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100, + "links": [ + { + "title": "Open build artifacts", + "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/artifact/", + "targetBlank": true + }, + { + "title": "Open build", + "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/", + "targetBlank": true + } + ] }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } }, "links": [ @@ -3267,7 +3409,7 @@ }, "targets": [ { - "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d])))", + "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d])))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{branch}}", "instant": true @@ -3434,7 +3576,7 @@ }, "targets": [ { - "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[30d])))", + "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[30d])))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{branch}}", "instant": true @@ -3684,7 +3826,7 @@ }, "targets": [ { - "expr": "(count(sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"}) or on() vector(0))", + "expr": "(count(max by (project_key) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"})) or on() vector(0))", "refId": "A", "instant": true } @@ -3816,13 +3958,13 @@ }, "gridPos": { "h": 6, - "w": 6, + "w": 4, "x": 12, "y": 111 }, "targets": [ { - "expr": "count by (status) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"})", + "expr": "count by (status) (max by (project_key, status) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"}))", "refId": "A", "legendFormat": "{{status}}" } @@ -3859,24 +4001,24 @@ }, { "id": 35, - "type": "bargauge", + "type": "state-timeline", "title": "Projects Failing Sonar Gate", + "description": "Projects observed with a non-OK SonarQube gate status over time. The query deduplicates pod/service endpoint scrapes before rendering.", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, - "w": 6, - "x": 18, + "w": 8, + "x": 16, "y": 111 }, "targets": [ { - "expr": "sort_desc((sort_desc(count by (project_key) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\",status!~\"OK|ok\"})) or on() label_replace(vector(0), \"project_key\", \"none\", \"__name__\", \".*\")))", + "expr": "max by (project_key) ((max by (project_key, status) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\",status!~\"OK|ok\"})) * 0 + 1)", "refId": "A", - "legendFormat": "{{project_key}}", - "instant": true + "legendFormat": "{{project_key}}" } ], "fieldConfig": { @@ -3885,8 +4027,6 @@ "mode": "thresholds" }, "unit": "none", - "min": 0, - "max": null, "thresholds": { "mode": "absolute", "steps": [ @@ -3894,45 +4034,34 @@ "color": "dark-green", "value": null }, - { - "color": "dark-yellow", - "value": 1 - }, - { - "color": "dark-orange", - "value": 3 - }, { "color": "dark-red", - "value": 5 + "value": 1 } ] - } + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 1 }, "overrides": [] }, "options": { - "displayMode": "basic", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "mergeValues": true, + "showValue": "never", + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + } } ] } @@ -4023,7 +4152,7 @@ "name": "branch", "label": "Branch", "type": "query", - "query": "label_values(platform_quality_gate_build_info{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}, branch)", + "query": "label_values(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",exported_job=\"platform-quality-ci\"}, branch)", "current": { "text": "All", "value": "$__all", @@ -4042,7 +4171,7 @@ "name": "test", "label": "Test Case", "type": "query", - "query": "label_values(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\"}, test)", + "query": "label_values(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\"}, test)", "current": { "text": "All", "value": "$__all", diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 6bf593a0..b916d722 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -839,8 +839,6 @@ data: "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -870,7 +868,9 @@ data: "fillOpacity": 70, "lineWidth": 0, "spanNulls": true - } + }, + "min": 0, + "max": 100 }, "overrides": [] }, @@ -969,8 +969,6 @@ data: "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -1000,7 +998,9 @@ data: "fillOpacity": 70, "lineWidth": 0, "spanNulls": true - } + }, + "min": 0, + "max": 100 }, "overrides": [] }, @@ -1045,8 +1045,6 @@ data: "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -1076,7 +1074,9 @@ data: "fillOpacity": 70, "lineWidth": 0, "spanNulls": true - } + }, + "min": 0, + "max": 100 }, "overrides": [] }, @@ -1109,8 +1109,9 @@ data: "panels": [ { "id": 130, - "type": "timeseries", + "type": "state-timeline", "title": "Tests Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1130,6 +1131,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1152,31 +1156,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 131, - "type": "timeseries", + "type": "state-timeline", "title": "Coverage Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1196,6 +1203,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1218,31 +1228,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 132, - "type": "timeseries", + "type": "state-timeline", "title": "LOC Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1262,6 +1275,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1284,31 +1300,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 133, - "type": "timeseries", + "type": "state-timeline", "title": "Style Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1328,6 +1347,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1350,31 +1372,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 134, - "type": "timeseries", + "type": "state-timeline", "title": "Gate Glue Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1394,6 +1419,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1416,31 +1444,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 135, - "type": "timeseries", + "type": "state-timeline", "title": "SonarQube Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1460,6 +1491,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1482,31 +1516,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 136, - "type": "timeseries", + "type": "state-timeline", "title": "Supply Chain Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1526,6 +1563,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1548,26 +1588,28 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } } ] }, @@ -1585,8 +1627,9 @@ data: "panels": [ { "id": 138, - "type": "timeseries", + "type": "state-timeline", "title": "Tests Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1606,6 +1649,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1632,31 +1678,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 139, - "type": "timeseries", + "type": "state-timeline", "title": "Coverage Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1676,6 +1725,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1702,31 +1754,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 140, - "type": "timeseries", + "type": "state-timeline", "title": "LOC Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1746,6 +1801,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1772,31 +1830,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 141, - "type": "timeseries", + "type": "state-timeline", "title": "Style Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1816,6 +1877,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1842,31 +1906,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 142, - "type": "timeseries", + "type": "state-timeline", "title": "Gate Glue Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1886,6 +1953,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1912,31 +1982,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 143, - "type": "timeseries", + "type": "state-timeline", "title": "SonarQube Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1956,6 +2029,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1982,31 +2058,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 144, - "type": "timeseries", + "type": "state-timeline", "title": "Supply Chain Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2026,6 +2105,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -2052,26 +2134,28 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } } ] }, @@ -2089,8 +2173,9 @@ data: "panels": [ { "id": 145, - "type": "timeseries", + "type": "state-timeline", "title": "Problematic Tests Over Time (Top failures)", + "description": "Top failing test cases over time, using memoized hourly rollups. Blank branch/test labels and placeholder no-test-case rows are excluded.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2103,14 +2188,44 @@ data: }, "targets": [ { - "expr": "(topk(12, ((sum by (suite, test) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",status=\"failed\"})) or on(suite, test) (sum by (suite, test) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[$__interval])))))) or on() vector(0)", + "expr": "topk(12, ((sum by (suite, test) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"})) or on(suite, test) (sum by (suite, test) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[$__interval])))))", "refId": "A", "legendFormat": "{{suite}} - {{test}}" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "none", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-green", + "value": null + }, + { + "color": "dark-yellow", + "value": 1 + }, + { + "color": "dark-orange", + "value": 3 + }, + { + "color": "dark-red", + "value": 5 + } + ] + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, "links": [ { "title": "Open latest artifacts", @@ -2127,12 +2242,15 @@ data: "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", - "placement": "right" + "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } }, "links": [ @@ -2249,7 +2367,7 @@ data: }, "targets": [ { - "expr": "sort_desc((topk by (suite) (1, ((sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",status=\"failed\"}[30d:1h]))) or on(suite, test) (sum by (suite, test) (increase(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[30d])))))) or on() vector(0))", + "expr": "sort_desc(topk by (suite) (1, ((sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[30d:1h]))) or on(suite, test) (sum by (suite, test) (increase(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[30d]))))))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{test}}", "instant": true @@ -2442,18 +2560,18 @@ data: "targets": [ { "refId": "A", - "expr": "((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"passed\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"passed\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", - "legendFormat": "{{suite}} passed" + "expr": "(sum(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"passed\"}) or on() vector(0))", + "legendFormat": "Passed" }, { "refId": "B", - "expr": "((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"failed\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", - "legendFormat": "{{suite}} failed" + "expr": "(sum(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"failed\"}) or on() vector(0))", + "legendFormat": "Failed" }, { "refId": "C", - "expr": "((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"skipped\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"skipped\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", - "legendFormat": "{{suite}} skipped" + "expr": "(sum(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"skipped\"}) or on() vector(0))", + "legendFormat": "Skipped" } ], "fieldConfig": { @@ -2470,7 +2588,18 @@ data: "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/", "targetBlank": true } - ] + ], + "min": 0, + "custom": { + "drawStyle": "bars", + "barAlignment": 0, + "lineWidth": 0, + "fillOpacity": 70, + "stacking": { + "mode": "normal", + "group": "A" + } + } }, "overrides": [] }, @@ -2579,12 +2708,14 @@ data: "url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/", "targetBlank": true } - ] + ], + "description": "Stacked hourly outcome volume for the selected suite/branch/test scope. This uses vmalert rollups only, avoiding expensive raw 30-day per-test scans." }, { "id": 152, - "type": "timeseries", + "type": "state-timeline", "title": "Selected Test Pass Rate History", + "description": "Average pass rate per suite for the selected test filter, using memoized hourly test-case pass-rate rollups instead of raw historical scans.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2597,28 +2728,17 @@ data: }, "targets": [ { - "expr": "((100 * (((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"passed\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"passed\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))) / clamp_min(((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=~\"passed|failed|error|skipped\"}) or on(suite) sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=~\"passed|failed|error|skipped\"}[$__interval])))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", + "expr": "avg by (suite) (platform_quality:test_case_pass_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\"})", "refId": "A", "legendFormat": "{{suite}}" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", - "links": [ - { - "title": "Open build artifacts", - "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/artifact/", - "targetBlank": true - }, - { - "title": "Open build", - "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/", - "targetBlank": true - } - ], - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -2643,17 +2763,39 @@ data: "value": 100 } ] - } + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100, + "links": [ + { + "title": "Open build artifacts", + "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/artifact/", + "targetBlank": true + }, + { + "title": "Open build", + "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/", + "targetBlank": true + } + ] }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } }, "links": [ @@ -3276,7 +3418,7 @@ data: }, "targets": [ { - "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d])))", + "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d])))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{branch}}", "instant": true @@ -3443,7 +3585,7 @@ data: }, "targets": [ { - "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[30d])))", + "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[30d])))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{branch}}", "instant": true @@ -3693,7 +3835,7 @@ data: }, "targets": [ { - "expr": "(count(sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"}) or on() vector(0))", + "expr": "(count(max by (project_key) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"})) or on() vector(0))", "refId": "A", "instant": true } @@ -3825,13 +3967,13 @@ data: }, "gridPos": { "h": 6, - "w": 6, + "w": 4, "x": 12, "y": 111 }, "targets": [ { - "expr": "count by (status) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"})", + "expr": "count by (status) (max by (project_key, status) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"}))", "refId": "A", "legendFormat": "{{status}}" } @@ -3868,24 +4010,24 @@ data: }, { "id": 35, - "type": "bargauge", + "type": "state-timeline", "title": "Projects Failing Sonar Gate", + "description": "Projects observed with a non-OK SonarQube gate status over time. The query deduplicates pod/service endpoint scrapes before rendering.", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, - "w": 6, - "x": 18, + "w": 8, + "x": 16, "y": 111 }, "targets": [ { - "expr": "sort_desc((sort_desc(count by (project_key) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\",status!~\"OK|ok\"})) or on() label_replace(vector(0), \"project_key\", \"none\", \"__name__\", \".*\")))", + "expr": "max by (project_key) ((max by (project_key, status) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\",status!~\"OK|ok\"})) * 0 + 1)", "refId": "A", - "legendFormat": "{{project_key}}", - "instant": true + "legendFormat": "{{project_key}}" } ], "fieldConfig": { @@ -3894,8 +4036,6 @@ data: "mode": "thresholds" }, "unit": "none", - "min": 0, - "max": null, "thresholds": { "mode": "absolute", "steps": [ @@ -3903,45 +4043,34 @@ data: "color": "dark-green", "value": null }, - { - "color": "dark-yellow", - "value": 1 - }, - { - "color": "dark-orange", - "value": 3 - }, { "color": "dark-red", - "value": 5 + "value": 1 } ] - } + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 1 }, "overrides": [] }, "options": { - "displayMode": "basic", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "mergeValues": true, + "showValue": "never", + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + } } ] } @@ -4032,7 +4161,7 @@ data: "name": "branch", "label": "Branch", "type": "query", - "query": "label_values(platform_quality_gate_build_info{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}, branch)", + "query": "label_values(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",exported_job=\"platform-quality-ci\"}, branch)", "current": { "text": "All", "value": "$__all", @@ -4051,7 +4180,7 @@ data: "name": "test", "label": "Test Case", "type": "query", - "query": "label_values(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\"}, test)", + "query": "label_values(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\"}, test)", "current": { "text": "All", "value": "$__all", diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml index 9f4f1617..f9ebce18 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -839,8 +839,6 @@ data: "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -870,7 +868,9 @@ data: "fillOpacity": 70, "lineWidth": 0, "spanNulls": true - } + }, + "min": 0, + "max": 100 }, "overrides": [] }, @@ -969,8 +969,6 @@ data: "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -1000,7 +998,9 @@ data: "fillOpacity": 70, "lineWidth": 0, "spanNulls": true - } + }, + "min": 0, + "max": 100 }, "overrides": [] }, @@ -1045,8 +1045,6 @@ data: "mode": "thresholds" }, "unit": "percent", - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -1076,7 +1074,9 @@ data: "fillOpacity": 70, "lineWidth": 0, "spanNulls": true - } + }, + "min": 0, + "max": 100 }, "overrides": [] }, @@ -1109,8 +1109,9 @@ data: "panels": [ { "id": 130, - "type": "timeseries", + "type": "state-timeline", "title": "Tests Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1130,6 +1131,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1152,31 +1156,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 131, - "type": "timeseries", + "type": "state-timeline", "title": "Coverage Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1196,6 +1203,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1218,31 +1228,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 132, - "type": "timeseries", + "type": "state-timeline", "title": "LOC Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1262,6 +1275,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1284,31 +1300,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 133, - "type": "timeseries", + "type": "state-timeline", "title": "Style Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1328,6 +1347,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1350,31 +1372,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 134, - "type": "timeseries", + "type": "state-timeline", "title": "Gate Glue Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1394,6 +1419,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1416,31 +1444,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 135, - "type": "timeseries", + "type": "state-timeline", "title": "SonarQube Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1460,6 +1491,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1482,31 +1516,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } }, { "id": 136, - "type": "timeseries", + "type": "state-timeline", "title": "Supply Chain Failure Rate", + "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1526,6 +1563,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1548,26 +1588,28 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current bad-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." + } } ] }, @@ -1585,8 +1627,9 @@ data: "panels": [ { "id": 138, - "type": "timeseries", + "type": "state-timeline", "title": "Tests Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1606,6 +1649,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1632,31 +1678,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 139, - "type": "timeseries", + "type": "state-timeline", "title": "Coverage Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1676,6 +1725,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1702,31 +1754,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 140, - "type": "timeseries", + "type": "state-timeline", "title": "LOC Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1746,6 +1801,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1772,31 +1830,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 141, - "type": "timeseries", + "type": "state-timeline", "title": "Style Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1816,6 +1877,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1842,31 +1906,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 142, - "type": "timeseries", + "type": "state-timeline", "title": "Gate Glue Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1886,6 +1953,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1912,31 +1982,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 143, - "type": "timeseries", + "type": "state-timeline", "title": "SonarQube Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1956,6 +2029,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -1982,31 +2058,34 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } }, { "id": 144, - "type": "timeseries", + "type": "state-timeline", "title": "Supply Chain Healthy Rate", + "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2026,6 +2105,9 @@ data: ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", "thresholds": { "mode": "absolute", @@ -2052,26 +2134,28 @@ data: } ] }, - "min": 0, - "max": 100, "custom": { - "spanNulls": true, - "showPoints": "never", - "lineWidth": 2 - } + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100 }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } - }, - "description": "Current acceptable-state percentage for this check family, evaluated over time. Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." + } } ] }, @@ -2089,8 +2173,9 @@ data: "panels": [ { "id": 145, - "type": "timeseries", + "type": "state-timeline", "title": "Problematic Tests Over Time (Top failures)", + "description": "Top failing test cases over time, using memoized hourly rollups. Blank branch/test labels and placeholder no-test-case rows are excluded.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2103,14 +2188,44 @@ data: }, "targets": [ { - "expr": "(topk(12, ((sum by (suite, test) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",status=\"failed\"})) or on(suite, test) (sum by (suite, test) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[$__interval])))))) or on() vector(0)", + "expr": "topk(12, ((sum by (suite, test) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"})) or on(suite, test) (sum by (suite, test) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[$__interval])))))", "refId": "A", "legendFormat": "{{suite}} - {{test}}" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "none", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-green", + "value": null + }, + { + "color": "dark-yellow", + "value": 1 + }, + { + "color": "dark-orange", + "value": 3 + }, + { + "color": "dark-red", + "value": 5 + } + ] + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, "links": [ { "title": "Open latest artifacts", @@ -2127,12 +2242,15 @@ data: "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", - "placement": "right" + "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } }, "links": [ @@ -2249,7 +2367,7 @@ data: }, "targets": [ { - "expr": "sort_desc((topk by (suite) (1, ((sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",status=\"failed\"}[30d:1h]))) or on(suite, test) (sum by (suite, test) (increase(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[30d])))))) or on() vector(0))", + "expr": "sort_desc(topk by (suite) (1, ((sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[30d:1h]))) or on(suite, test) (sum by (suite, test) (increase(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[30d]))))))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{test}}", "instant": true @@ -2442,18 +2560,18 @@ data: "targets": [ { "refId": "A", - "expr": "((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"passed\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"passed\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", - "legendFormat": "{{suite}} passed" + "expr": "(sum(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"passed\"}) or on() vector(0))", + "legendFormat": "Passed" }, { "refId": "B", - "expr": "((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"failed\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"failed\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", - "legendFormat": "{{suite}} failed" + "expr": "(sum(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"failed\"}) or on() vector(0))", + "legendFormat": "Failed" }, { "refId": "C", - "expr": "((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"skipped\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"skipped\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", - "legendFormat": "{{suite}} skipped" + "expr": "(sum(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"skipped\"}) or on() vector(0))", + "legendFormat": "Skipped" } ], "fieldConfig": { @@ -2470,7 +2588,18 @@ data: "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/", "targetBlank": true } - ] + ], + "min": 0, + "custom": { + "drawStyle": "bars", + "barAlignment": 0, + "lineWidth": 0, + "fillOpacity": 70, + "stacking": { + "mode": "normal", + "group": "A" + } + } }, "overrides": [] }, @@ -2579,12 +2708,14 @@ data: "url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/", "targetBlank": true } - ] + ], + "description": "Stacked hourly outcome volume for the selected suite/branch/test scope. This uses vmalert rollups only, avoiding expensive raw 30-day per-test scans." }, { "id": 152, - "type": "timeseries", + "type": "state-timeline", "title": "Selected Test Pass Rate History", + "description": "Average pass rate per suite for the selected test filter, using memoized hourly test-case pass-rate rollups instead of raw historical scans.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2597,28 +2728,17 @@ data: }, "targets": [ { - "expr": "((100 * (((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=\"passed\"})) or on(suite) (sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=\"passed\"}[$__interval]))) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))) / clamp_min(((sum by (suite) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",status=~\"passed|failed|error|skipped\"}) or on(suite) sum by (suite) (max_over_time(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test=~\"${test:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\",status=~\"passed|failed|error|skipped\"}[$__interval])))), 1)) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))))", + "expr": "avg by (suite) (platform_quality:test_case_pass_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test=~\"${test:regex}\",test!=\"__no_test_cases__\"})", "refId": "A", "legendFormat": "{{suite}}" } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "unit": "percent", - "links": [ - { - "title": "Open build artifacts", - "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/artifact/", - "targetBlank": true - }, - { - "title": "Open build", - "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/", - "targetBlank": true - } - ], - "min": 0, - "max": 100, "thresholds": { "mode": "absolute", "steps": [ @@ -2643,17 +2763,39 @@ data: "value": 100 } ] - } + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 100, + "links": [ + { + "title": "Open build artifacts", + "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/artifact/", + "targetBlank": true + }, + { + "title": "Open build", + "url": "${jenkins_base}/job/${__field.labels.jenkins_job}/${__field.labels.build_number}/", + "targetBlank": true + } + ] }, "overrides": [] }, "options": { + "mergeValues": true, + "showValue": "never", "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { - "mode": "multi" + "mode": "single", + "sort": "none" } }, "links": [ @@ -3276,7 +3418,7 @@ data: }, "targets": [ { - "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d])))", + "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d])))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{branch}}", "instant": true @@ -3443,7 +3585,7 @@ data: }, "targets": [ { - "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[30d])))", + "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[30d])))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{branch}}", "instant": true @@ -3693,7 +3835,7 @@ data: }, "targets": [ { - "expr": "(count(sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"}) or on() vector(0))", + "expr": "(count(max by (project_key) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"})) or on() vector(0))", "refId": "A", "instant": true } @@ -3825,13 +3967,13 @@ data: }, "gridPos": { "h": 6, - "w": 6, + "w": 4, "x": 12, "y": 111 }, "targets": [ { - "expr": "count by (status) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"})", + "expr": "count by (status) (max by (project_key, status) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\"}))", "refId": "A", "legendFormat": "{{status}}" } @@ -3868,24 +4010,24 @@ data: }, { "id": 35, - "type": "bargauge", + "type": "state-timeline", "title": "Projects Failing Sonar Gate", + "description": "Projects observed with a non-OK SonarQube gate status over time. The query deduplicates pod/service endpoint scrapes before rendering.", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, - "w": 6, - "x": 18, + "w": 8, + "x": 16, "y": 111 }, "targets": [ { - "expr": "sort_desc((sort_desc(count by (project_key) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\",status!~\"OK|ok\"})) or on() label_replace(vector(0), \"project_key\", \"none\", \"__name__\", \".*\")))", + "expr": "max by (project_key) ((max by (project_key, status) (sonarqube_project_quality_gate_pass{project_key=~\"${suite:regex}\",status!~\"OK|ok\"})) * 0 + 1)", "refId": "A", - "legendFormat": "{{project_key}}", - "instant": true + "legendFormat": "{{project_key}}" } ], "fieldConfig": { @@ -3894,8 +4036,6 @@ data: "mode": "thresholds" }, "unit": "none", - "min": 0, - "max": null, "thresholds": { "mode": "absolute", "steps": [ @@ -3903,45 +4043,34 @@ data: "color": "dark-green", "value": null }, - { - "color": "dark-yellow", - "value": 1 - }, - { - "color": "dark-orange", - "value": 3 - }, { "color": "dark-red", - "value": 5 + "value": 1 } ] - } + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": true + }, + "min": 0, + "max": 1 }, "overrides": [] }, "options": { - "displayMode": "basic", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "mergeValues": true, + "showValue": "never", + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + } } ] } @@ -4032,7 +4161,7 @@ data: "name": "branch", "label": "Branch", "type": "query", - "query": "label_values(platform_quality_gate_build_info{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}, branch)", + "query": "label_values(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",exported_job=\"platform-quality-ci\"}, branch)", "current": { "text": "All", "value": "$__all", @@ -4051,7 +4180,7 @@ data: "name": "test", "label": "Test Case", "type": "query", - "query": "label_values(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch=~\"${branch:regex}\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\"}, test)", + "query": "label_values(platform_quality_gate_test_case_result{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",exported_job=\"platform-quality-ci\"}, test)", "current": { "text": "All", "value": "$__all", diff --git a/services/monitoring/vmalert-atlas-availability.yaml b/services/monitoring/vmalert-atlas-availability.yaml index b80525eb..c354d02d 100644 --- a/services/monitoring/vmalert-atlas-availability.yaml +++ b/services/monitoring/vmalert-atlas-availability.yaml @@ -147,7 +147,7 @@ data: - record: platform_quality:test_case_status:count_1h expr: | sum by (suite, branch, test, status) ( - max_over_time(platform_quality_gate_test_case_result{exported_job="platform-quality-ci",test!="__no_test_cases__"}[1h]) + max_over_time(platform_quality_gate_test_case_result{exported_job="platform-quality-ci",branch!="",test!="",test!="__no_test_cases__"}[1h]) ) labels: rollup: hourly @@ -155,13 +155,13 @@ data: expr: | 100 * ( sum by (suite, branch, test) ( - max_over_time(platform_quality_gate_test_case_result{exported_job="platform-quality-ci",test!="__no_test_cases__",status="passed"}[1h]) + max_over_time(platform_quality_gate_test_case_result{exported_job="platform-quality-ci",branch!="",test!="",test!="__no_test_cases__",status="passed"}[1h]) ) ) / clamp_min( sum by (suite, branch, test) ( - max_over_time(platform_quality_gate_test_case_result{exported_job="platform-quality-ci",test!="__no_test_cases__",status=~"passed|failed|error|skipped"}[1h]) + max_over_time(platform_quality_gate_test_case_result{exported_job="platform-quality-ci",branch!="",test!="",test!="__no_test_cases__",status=~"passed|failed|error|skipped"}[1h]) ), 1 ) @@ -196,7 +196,7 @@ spec: labels: app: vmalert-atlas-availability annotations: - bstein.dev/rules-revision: "2026-05-15-platform-quality-rollups-v2" + bstein.dev/rules-revision: "2026-05-15-platform-quality-rollups-v3" spec: serviceAccountName: vmalert-atlas-availability affinity: