diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 37ee84f0..daf147fe 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -608,6 +608,7 @@ PLATFORM_TEST_SOURCE_LINES_OVER_500_ROLLUP = ( "platform_quality:suite_source_lines_over_500_total:latest_1h" ) PLATFORM_TEST_SONAR_HEALTH_ROLLUP = "platform_quality:sonar_gate_health_percent:latest_1h" +PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP = "platform_quality:test_category_health_rate:percent_1h" PLATFORM_TEST_SUCCESS_EVENTS_30D = ( f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",status=~\"{PLATFORM_TEST_SUCCESS_STATUS}\",{PLATFORM_TEST_EXPORT_FILTER}", "30d", "15m")}) or on() vector(0))' ) @@ -803,10 +804,10 @@ PLATFORM_TEST_CURRENT_GATE_CHECK_HEALTH_BY_SUITE = ( f'/ clamp_min(sum by (suite) ({PLATFORM_TEST_CHECK_ROLLUP_SEEN_FLAGS}), 1))' ) PLATFORM_TEST_CATEGORY_HEALTH_BY_SUITE = ( - 'min by (suite) (platform_quality:test_case_health_rate:percent_1h{' + f'min by (suite) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{' f'suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}",branch!="",' - f'branch=~"main|master|origin/main|origin/master",test!="",' - f'test!="__no_test_cases__",category=~"{PLATFORM_TEST_CATEGORY_REGEX}"' + f'branch=~"main|master|origin/main|origin/master",' + f'category=~"{PLATFORM_TEST_CATEGORY_REGEX}"' "})" ) PLATFORM_TEST_CURRENT_GATE_HEALTH_OBSERVED_BY_SUITE = ( @@ -1843,7 +1844,7 @@ TESTING_PANEL_DESCRIPTIONS = { "SonarQube Healthy Rate": "Percent of Sonar checks passing or not applicable; higher is better.", "Supply Chain Healthy Rate": "Percent of supply-chain checks passing or not applicable; higher is better.", "Test Drilldowns And Problem Tests": "Test-case detail for finding which tests are hurting reliability.", - "Problematic Tests Over Time (Top failures)": "Top tests failing in each hourly bucket; old totals may only appear in the 30d panel.", + "Problematic Tests Over Time (Top failures)": "Current outlier tests by rolling 24h failures; tests need repeat failures to stay visible.", "Most Problematic Test by Suite (30d)": "Worst test per suite summed over 30d; high counts can be historical debt.", "Selected Test Pass/Fail History": "Hourly pass/fail/skipped volume for the selected test filter.", "Selected Test Pass Rate History": "Pass rate history for the selected test filter; higher means the test is stable.", @@ -2217,9 +2218,9 @@ def build_overview(): } overview_avg_coverage = f"(avg(({QUALITY_GATE_COVERAGE_BY_SUITE})) or on() vector(0))" overview_category_health = ( - 'avg by (category) (platform_quality:test_case_health_rate:percent_1h{' + f'avg by (category) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{' f'suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}",branch!="",branch=~"main|master|origin/main|origin/master",' - f'test!="",test!="__no_test_cases__",category=~"{PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX}"' + f'category=~"{PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX}"' "})" ) for panel_id, title, draw_expr, runtime_expr, y_pos in [ @@ -3972,9 +3973,8 @@ def build_jobs_dashboard(): f"(100 * ({current_gate_ok_checks}) / clamp_min(({current_gate_seen_checks}), 1))" ) current_category_health_by_suite = ( - f'min by (suite) (platform_quality:test_case_health_rate:percent_1h{{suite=~"{suite_var}",' - f'branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",' - f'category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})' + f'min by (suite) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{suite=~"{suite_var}",' + f'branch!="",branch=~"{branch_var}",category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})' ) current_gate_health_observed_by_suite = ( f"min by (suite) (({current_gate_check_health_by_suite}) " @@ -4064,10 +4064,20 @@ def build_jobs_dashboard(): state_percent = f"(100 * ({state_checks}) / clamp_min(({total_checks}), 1))" return f"(({state_percent}) or on(suite) ({selected_suite_zero}))" - rollup_failed_tests = ( - f'sum by (suite, test) (platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}})' + failed_test_status_selector = ( + f'platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",' + f'branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}}' + ) + rollup_failed_tests = ( + f"sum by (suite, test) (sum_over_time({failed_test_status_selector}[24h:1h]))" + ) + current_problem_test_candidates = ( + f"sum by (suite, test) (sum_over_time({failed_test_status_selector}[24h:1h] @ end()))" + ) + problematic_tests_history_core = ( + f"({rollup_failed_tests}) " + f"and on (suite, test) topk(12, ({current_problem_test_candidates}) >= 2)" ) - problematic_tests_history_core = f"topk(12, ({rollup_failed_tests}))" problematic_tests_history = problematic_tests_history_core rollup_failed_tests_30d = ( f'sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}}[30d:1h]))' @@ -4106,9 +4116,8 @@ def build_jobs_dashboard(): f'branch!="",branch=~"{branch_var}",test!="",test=~"{test_var}",test!="__no_test_cases__"}})' ) category_pass_rate_history = ( - f'avg by (category) (platform_quality:test_case_health_rate:percent_1h{{suite=~"{suite_var}",' - f'branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",' - f'category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})' + f'avg by (category) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{suite=~"{suite_var}",' + f'branch!="",branch=~"{branch_var}",category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})' ) recent_branch_evidence = ( f'sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[30d:15m])))' @@ -4467,8 +4476,8 @@ def build_jobs_dashboard(): max_value=None, legend="{{suite}} - {{test}}", description=( - "Top failing tests inside each hourly bucket. Short current bars can still belong to tests " - "with larger long-window totals." + "Current outlier tests by rolling 24h failure count. A test needs at least two recent " + "failures to appear, then falls off once it quiets down." ), ) ) @@ -4544,7 +4553,7 @@ def build_jobs_dashboard(): thresholds=success_thresholds, legend="{{category}}", description=( - "Health by test category from current per-test metrics. Use the Suite filter to focus one " + "Health by test category from memoized hourly rollups. Use the Suite filter to focus one " "project; skipped tests are healthy, while failures and errors lower the lane." ), ) diff --git a/scripts/tests/test_dashboards_render_atlas.py b/scripts/tests/test_dashboards_render_atlas.py index 5cfb9339..44e6fa50 100644 --- a/scripts/tests/test_dashboards_render_atlas.py +++ b/scripts/tests/test_dashboards_render_atlas.py @@ -79,7 +79,7 @@ def test_overview_uses_readable_quality_power_and_gitops_panels(): assert panels_by_title["Test Category Health"]["targets"][0]["range"] is True assert "${overview_suite:regex}" not in panels_by_title["Test Category Health"]["targets"][0]["expr"] assert mod.PLATFORM_TEST_SUITE_CANONICAL_MATCHER in panels_by_title["Test Category Health"]["targets"][0]["expr"] - assert "platform_quality:test_case_health_rate:percent_1h" in panels_by_title["Test Category Health"]["targets"][0]["expr"] + assert "platform_quality:test_category_health_rate:percent_1h" in panels_by_title["Test Category Health"]["targets"][0]["expr"] assert panels_by_title["Test Category Health"]["timeFrom"] == "24h" assert f'category=~"{mod.PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX}"' in panels_by_title["Test Category Health"]["targets"][0]["expr"] assert "manual" not in mod.PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX @@ -127,7 +127,7 @@ def test_overview_uses_readable_quality_power_and_gitops_panels(): assert panels_by_title["Flux Source"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 7} assert panels_by_title["Flux Source"]["targets"][0]["legendFormat"] == "{{branch}}" assert panels_by_title["Current Gate Health"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 9} - assert "platform_quality:test_case_health_rate:percent_1h" in panels_by_title["Current Gate Health"]["targets"][0]["expr"] + assert "platform_quality:test_category_health_rate:percent_1h" in panels_by_title["Current Gate Health"]["targets"][0]["expr"] assert panels_by_title["CI Run Success (24h)"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 11} assert panels_by_title["Suites With Runs (24h)"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 15} suites_reporting_expr = panels_by_title["Suites With Runs (24h)"]["targets"][0]["expr"] @@ -245,7 +245,7 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability(): current_gate_expr = panels_by_title["Latest Gate Health by Suite"]["targets"][0]["expr"] assert 'check)' in current_gate_expr assert "platform_quality:check_status:present_1h" in current_gate_expr - assert "platform_quality:test_case_health_rate:percent_1h" in current_gate_expr + assert "platform_quality:test_category_health_rate:percent_1h" in current_gate_expr assert "- 1" in current_gate_expr assert '.*_quality_gate_checks_total' not in current_gate_expr assert "last_over_time" not in current_gate_expr @@ -256,7 +256,7 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability(): assert "tlast_over_time" not in current_gate_expr assert panels_by_title["Current Gate Health (%)"]["gridPos"] == {"h": 4, "w": 3, "x": 0, "y": 0} current_gate_stat_expr = panels_by_title["Current Gate Health (%)"]["targets"][0]["expr"] - assert "platform_quality:test_case_health_rate:percent_1h" in current_gate_stat_expr + assert "platform_quality:test_category_health_rate:percent_1h" in current_gate_stat_expr assert "- 1" not in current_gate_stat_expr assert panels_by_title["Suite Freshness (24h)"]["gridPos"] == {"h": 4, "w": 3, "x": 15, "y": 0} suite_freshness_expr = panels_by_title["Suite Freshness (24h)"]["targets"][0]["expr"] @@ -287,7 +287,8 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability(): assert "category" in category_panel["targets"][0]["expr"] assert category_panel["targets"][0]["format"] == "time_series" assert category_panel["targets"][0]["range"] is True - assert "platform_quality:test_case_health_rate:percent_1h" in category_panel["targets"][0]["expr"] + assert "platform_quality:test_category_health_rate:percent_1h" in category_panel["targets"][0]["expr"] + assert 'test!=""' not in category_panel["targets"][0]["expr"] assert f'category=~"{mod.PLATFORM_TEST_CATEGORY_REGEX}"' in category_panel["targets"][0]["expr"] assert "installer" not in mod.PLATFORM_TEST_CATEGORY_REGEX assert "Use the Suite filter" in category_panel["description"] @@ -305,6 +306,9 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability(): problematic_expr = panels_by_title["Problematic Tests Over Time (Top failures)"]["targets"][0]["expr"] assert "platform_quality:test_case_status:count_1h" in problematic_expr assert "platform_quality_gate_test_case_result" not in problematic_expr + assert "[24h:1h]" in problematic_expr + assert ">= 2" in problematic_expr + assert "@ end()" in problematic_expr coverage_panel = panels_by_title["Coverage History by Suite"] loc_panel = panels_by_title["Files <=500 LOC History by Suite"] @@ -424,7 +428,8 @@ def test_jobs_dashboard_collapses_heavy_drilldowns_for_light_first_paint(): {"color": "dark-orange", "value": 5}, {"color": "dark-red", "value": 8}, ] - assert "hourly bucket" in problematic_panel["description"] + assert "rolling 24h failure count" in problematic_panel["description"] + assert "at least two recent failures" in problematic_panel["description"] sonar_mix_panel = nested_panels_by_title["Sonar Gate Status Mix (Selected)"] sonar_health_panel = nested_panels_by_title["Sonar Gate Health by Project"] diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 46100d71..15dd6474 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -2130,7 +2130,7 @@ }, "targets": [ { - "expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))", + "expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))", "refId": "A", "instant": true } @@ -2819,7 +2819,7 @@ }, "targets": [ { - "expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})", + "expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})", "refId": "A", "legendFormat": "{{category}}", "format": "time_series", diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json index ca929df3..0bdfe03f 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-testing.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(avg((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))", + "expr": "(avg((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))", "refId": "A", "instant": true } @@ -605,7 +605,7 @@ }, "targets": [ { - "expr": "sort(((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0)))) - 1))))", + "expr": "sort(((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0)))) - 1))))", "refId": "A", "legendFormat": "{{suite}}", "instant": true @@ -1056,7 +1056,7 @@ "id": 153, "type": "state-timeline", "title": "Test Category Health History", - "description": "Health by test category from current per-test metrics. Use the Suite filter to focus one project; skipped tests are healthy, while failures and errors lower the lane.", + "description": "Health by test category from memoized hourly rollups. Use the Suite filter to focus one project; skipped tests are healthy, while failures and errors lower the lane.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1069,7 +1069,7 @@ }, "targets": [ { - "expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})", + "expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})", "refId": "A", "legendFormat": "{{category}}", "format": "time_series", @@ -2615,7 +2615,7 @@ "id": 145, "type": "state-timeline", "title": "Problematic Tests Over Time (Top failures)", - "description": "Top failing tests inside each hourly bucket. Short current bars can still belong to tests with larger long-window totals.", + "description": "Current outlier tests by rolling 24h failure count. A test needs at least two recent failures to appear, then falls off once it quiets down.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2628,7 +2628,7 @@ }, "targets": [ { - "expr": "topk(12, (sum by (suite, test) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"})))", + "expr": "(sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[24h:1h]))) and on (suite, test) topk(12, (sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[24h:1h] @ end()))) >= 2)", "refId": "A", "legendFormat": "{{suite}} - {{test}}", "format": "time_series", diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 2a2a1b8f..648efa66 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -2139,7 +2139,7 @@ data: }, "targets": [ { - "expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))", + "expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))", "refId": "A", "instant": true } @@ -2828,7 +2828,7 @@ data: }, "targets": [ { - "expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})", + "expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})", "refId": "A", "legendFormat": "{{category}}", "format": "time_series", diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml index 0c5ee8fd..4fd80b7c 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(avg((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))", + "expr": "(avg((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))", "refId": "A", "instant": true } @@ -614,7 +614,7 @@ data: }, "targets": [ { - "expr": "sort(((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0)))) - 1))))", + "expr": "sort(((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0)))) - 1))))", "refId": "A", "legendFormat": "{{suite}}", "instant": true @@ -1065,7 +1065,7 @@ data: "id": 153, "type": "state-timeline", "title": "Test Category Health History", - "description": "Health by test category from current per-test metrics. Use the Suite filter to focus one project; skipped tests are healthy, while failures and errors lower the lane.", + "description": "Health by test category from memoized hourly rollups. Use the Suite filter to focus one project; skipped tests are healthy, while failures and errors lower the lane.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1078,7 +1078,7 @@ data: }, "targets": [ { - "expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})", + "expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})", "refId": "A", "legendFormat": "{{category}}", "format": "time_series", @@ -2624,7 +2624,7 @@ data: "id": 145, "type": "state-timeline", "title": "Problematic Tests Over Time (Top failures)", - "description": "Top failing tests inside each hourly bucket. Short current bars can still belong to tests with larger long-window totals.", + "description": "Current outlier tests by rolling 24h failure count. A test needs at least two recent failures to appear, then falls off once it quiets down.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2637,7 +2637,7 @@ data: }, "targets": [ { - "expr": "topk(12, (sum by (suite, test) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"})))", + "expr": "(sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[24h:1h]))) and on (suite, test) topk(12, (sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[24h:1h] @ end()))) >= 2)", "refId": "A", "legendFormat": "{{suite}} - {{test}}", "format": "time_series", diff --git a/services/monitoring/vmalert-atlas-availability.yaml b/services/monitoring/vmalert-atlas-availability.yaml index 640ef6d9..a1138d75 100644 --- a/services/monitoring/vmalert-atlas-availability.yaml +++ b/services/monitoring/vmalert-atlas-availability.yaml @@ -227,6 +227,13 @@ data: ) labels: rollup: hourly + - record: platform_quality:test_category_health_rate:percent_1h + expr: | + avg by (suite, branch, category) ( + platform_quality:test_case_health_rate:percent_1h{suite!="",branch!="",test!="",test!="__no_test_cases__",category=~"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit"} + ) + labels: + rollup: hourly - record: platform_quality:suite_runs:increase_24h expr: | sum by (suite, branch, status) ( @@ -391,7 +398,7 @@ spec: labels: app: vmalert-atlas-availability annotations: - bstein.dev/rules-revision: "2026-05-20-platform-quality-fast-run-rollups" + bstein.dev/rules-revision: "2026-05-20-platform-quality-category-rollups" spec: serviceAccountName: vmalert-atlas-availability affinity: