monitoring(testing): backfill category health rollups

2026-05-20 14:37:00 -03:00 · 2026-05-20 14:37:00 -03:00 · 974955ac83
commit 974955ac83
parent 0ea80a8a19
7 changed files with 62 additions and 41 deletions
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -608,6 +608,7 @@ PLATFORM_TEST_SOURCE_LINES_OVER_500_ROLLUP = (
    "platform_quality:suite_source_lines_over_500_total:latest_1h"
 )
 PLATFORM_TEST_SONAR_HEALTH_ROLLUP = "platform_quality:sonar_gate_health_percent:latest_1h"
+PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP = "platform_quality:test_category_health_rate:percent_1h"
 PLATFORM_TEST_SUCCESS_EVENTS_30D = (
    f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",status=~\"{PLATFORM_TEST_SUCCESS_STATUS}\",{PLATFORM_TEST_EXPORT_FILTER}", "30d", "15m")}) or on() vector(0))'
 )
@@ -803,10 +804,10 @@ PLATFORM_TEST_CURRENT_GATE_CHECK_HEALTH_BY_SUITE = (
    f'/ clamp_min(sum by (suite) ({PLATFORM_TEST_CHECK_ROLLUP_SEEN_FLAGS}), 1))'
 )
 PLATFORM_TEST_CATEGORY_HEALTH_BY_SUITE = (
-    'min by (suite) (platform_quality:test_case_health_rate:percent_1h{'
+    f'min by (suite) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{'
    f'suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}",branch!="",'
-    f'branch=~"main|master|origin/main|origin/master",test!="",'
-    f'test!="__no_test_cases__",category=~"{PLATFORM_TEST_CATEGORY_REGEX}"'
+    f'branch=~"main|master|origin/main|origin/master",'
+    f'category=~"{PLATFORM_TEST_CATEGORY_REGEX}"'
    "})"
 )
 PLATFORM_TEST_CURRENT_GATE_HEALTH_OBSERVED_BY_SUITE = (
@@ -1843,7 +1844,7 @@ TESTING_PANEL_DESCRIPTIONS = {
    "SonarQube Healthy Rate": "Percent of Sonar checks passing or not applicable; higher is better.",
    "Supply Chain Healthy Rate": "Percent of supply-chain checks passing or not applicable; higher is better.",
    "Test Drilldowns And Problem Tests": "Test-case detail for finding which tests are hurting reliability.",
-    "Problematic Tests Over Time (Top failures)": "Top tests failing in each hourly bucket; old totals may only appear in the 30d panel.",
+    "Problematic Tests Over Time (Top failures)": "Current outlier tests by rolling 24h failures; tests need repeat failures to stay visible.",
    "Most Problematic Test by Suite (30d)": "Worst test per suite summed over 30d; high counts can be historical debt.",
    "Selected Test Pass/Fail History": "Hourly pass/fail/skipped volume for the selected test filter.",
    "Selected Test Pass Rate History": "Pass rate history for the selected test filter; higher means the test is stable.",
@@ -2217,9 +2218,9 @@ def build_overview():
    }
    overview_avg_coverage = f"(avg(({QUALITY_GATE_COVERAGE_BY_SUITE})) or on() vector(0))"
    overview_category_health = (
-        'avg by (category) (platform_quality:test_case_health_rate:percent_1h{'
+        f'avg by (category) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{'
        f'suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}",branch!="",branch=~"main|master|origin/main|origin/master",'
-        f'test!="",test!="__no_test_cases__",category=~"{PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX}"'
+        f'category=~"{PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX}"'
        "})"
    )
    for panel_id, title, draw_expr, runtime_expr, y_pos in [
@@ -3972,9 +3973,8 @@ def build_jobs_dashboard():
        f"(100 * ({current_gate_ok_checks}) / clamp_min(({current_gate_seen_checks}), 1))"
    )
    current_category_health_by_suite = (
-        f'min by (suite) (platform_quality:test_case_health_rate:percent_1h{{suite=~"{suite_var}",'
-        f'branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",'
-        f'category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})'
+        f'min by (suite) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{suite=~"{suite_var}",'
+        f'branch!="",branch=~"{branch_var}",category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})'
    )
    current_gate_health_observed_by_suite = (
        f"min by (suite) (({current_gate_check_health_by_suite}) "
@@ -4064,10 +4064,20 @@ def build_jobs_dashboard():
        state_percent = f"(100 * ({state_checks}) / clamp_min(({total_checks}), 1))"
        return f"(({state_percent}) or on(suite) ({selected_suite_zero}))"

-    rollup_failed_tests = (
-        f'sum by (suite, test) (platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}})'
+    failed_test_status_selector = (
+        f'platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",'
+        f'branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}}'
+    )
+    rollup_failed_tests = (
+        f"sum by (suite, test) (sum_over_time({failed_test_status_selector}[24h:1h]))"
+    )
+    current_problem_test_candidates = (
+        f"sum by (suite, test) (sum_over_time({failed_test_status_selector}[24h:1h] @ end()))"
+    )
+    problematic_tests_history_core = (
+        f"({rollup_failed_tests}) "
+        f"and on (suite, test) topk(12, ({current_problem_test_candidates}) >= 2)"
    )
-    problematic_tests_history_core = f"topk(12, ({rollup_failed_tests}))"
    problematic_tests_history = problematic_tests_history_core
    rollup_failed_tests_30d = (
        f'sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}}[30d:1h]))'
@@ -4106,9 +4116,8 @@ def build_jobs_dashboard():
        f'branch!="",branch=~"{branch_var}",test!="",test=~"{test_var}",test!="__no_test_cases__"}})'
    )
    category_pass_rate_history = (
-        f'avg by (category) (platform_quality:test_case_health_rate:percent_1h{{suite=~"{suite_var}",'
-        f'branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",'
-        f'category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})'
+        f'avg by (category) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{suite=~"{suite_var}",'
+        f'branch!="",branch=~"{branch_var}",category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})'
    )
    recent_branch_evidence = (
        f'sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[30d:15m])))'
@@ -4467,8 +4476,8 @@ def build_jobs_dashboard():
            max_value=None,
            legend="{{suite}} - {{test}}",
            description=(
-                "Top failing tests inside each hourly bucket. Short current bars can still belong to tests "
-                "with larger long-window totals."
+                "Current outlier tests by rolling 24h failure count. A test needs at least two recent "
+                "failures to appear, then falls off once it quiets down."
            ),
        )
    )
@@ -4544,7 +4553,7 @@ def build_jobs_dashboard():
        thresholds=success_thresholds,
        legend="{{category}}",
        description=(
-            "Health by test category from current per-test metrics. Use the Suite filter to focus one "
+            "Health by test category from memoized hourly rollups. Use the Suite filter to focus one "
            "project; skipped tests are healthy, while failures and errors lower the lane."
        ),
    )
--- a/scripts/tests/test_dashboards_render_atlas.py
+++ b/scripts/tests/test_dashboards_render_atlas.py
@@ -79,7 +79,7 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
    assert panels_by_title["Test Category Health"]["targets"][0]["range"] is True
    assert "${overview_suite:regex}" not in panels_by_title["Test Category Health"]["targets"][0]["expr"]
    assert mod.PLATFORM_TEST_SUITE_CANONICAL_MATCHER in panels_by_title["Test Category Health"]["targets"][0]["expr"]
-    assert "platform_quality:test_case_health_rate:percent_1h" in panels_by_title["Test Category Health"]["targets"][0]["expr"]
+    assert "platform_quality:test_category_health_rate:percent_1h" in panels_by_title["Test Category Health"]["targets"][0]["expr"]
    assert panels_by_title["Test Category Health"]["timeFrom"] == "24h"
    assert f'category=~"{mod.PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX}"' in panels_by_title["Test Category Health"]["targets"][0]["expr"]
    assert "manual" not in mod.PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX
@@ -127,7 +127,7 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
    assert panels_by_title["Flux Source"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 7}
    assert panels_by_title["Flux Source"]["targets"][0]["legendFormat"] == "{{branch}}"
    assert panels_by_title["Current Gate Health"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 9}
-    assert "platform_quality:test_case_health_rate:percent_1h" in panels_by_title["Current Gate Health"]["targets"][0]["expr"]
+    assert "platform_quality:test_category_health_rate:percent_1h" in panels_by_title["Current Gate Health"]["targets"][0]["expr"]
    assert panels_by_title["CI Run Success (24h)"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 11}
    assert panels_by_title["Suites With Runs (24h)"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 15}
    suites_reporting_expr = panels_by_title["Suites With Runs (24h)"]["targets"][0]["expr"]
@@ -245,7 +245,7 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability():
    current_gate_expr = panels_by_title["Latest Gate Health by Suite"]["targets"][0]["expr"]
    assert 'check)' in current_gate_expr
    assert "platform_quality:check_status:present_1h" in current_gate_expr
-    assert "platform_quality:test_case_health_rate:percent_1h" in current_gate_expr
+    assert "platform_quality:test_category_health_rate:percent_1h" in current_gate_expr
    assert "- 1" in current_gate_expr
    assert '.*_quality_gate_checks_total' not in current_gate_expr
    assert "last_over_time" not in current_gate_expr
@@ -256,7 +256,7 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability():
    assert "tlast_over_time" not in current_gate_expr
    assert panels_by_title["Current Gate Health (%)"]["gridPos"] == {"h": 4, "w": 3, "x": 0, "y": 0}
    current_gate_stat_expr = panels_by_title["Current Gate Health (%)"]["targets"][0]["expr"]
-    assert "platform_quality:test_case_health_rate:percent_1h" in current_gate_stat_expr
+    assert "platform_quality:test_category_health_rate:percent_1h" in current_gate_stat_expr
    assert "- 1" not in current_gate_stat_expr
    assert panels_by_title["Suite Freshness (24h)"]["gridPos"] == {"h": 4, "w": 3, "x": 15, "y": 0}
    suite_freshness_expr = panels_by_title["Suite Freshness (24h)"]["targets"][0]["expr"]
@@ -287,7 +287,8 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability():
    assert "category" in category_panel["targets"][0]["expr"]
    assert category_panel["targets"][0]["format"] == "time_series"
    assert category_panel["targets"][0]["range"] is True
-    assert "platform_quality:test_case_health_rate:percent_1h" in category_panel["targets"][0]["expr"]
+    assert "platform_quality:test_category_health_rate:percent_1h" in category_panel["targets"][0]["expr"]
+    assert 'test!=""' not in category_panel["targets"][0]["expr"]
    assert f'category=~"{mod.PLATFORM_TEST_CATEGORY_REGEX}"' in category_panel["targets"][0]["expr"]
    assert "installer" not in mod.PLATFORM_TEST_CATEGORY_REGEX
    assert "Use the Suite filter" in category_panel["description"]
@@ -305,6 +306,9 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability():
    problematic_expr = panels_by_title["Problematic Tests Over Time (Top failures)"]["targets"][0]["expr"]
    assert "platform_quality:test_case_status:count_1h" in problematic_expr
    assert "platform_quality_gate_test_case_result" not in problematic_expr
+    assert "[24h:1h]" in problematic_expr
+    assert ">= 2" in problematic_expr
+    assert "@ end()" in problematic_expr

    coverage_panel = panels_by_title["Coverage History by Suite"]
    loc_panel = panels_by_title["Files <=500 LOC History by Suite"]
@@ -424,7 +428,8 @@ def test_jobs_dashboard_collapses_heavy_drilldowns_for_light_first_paint():
        {"color": "dark-orange", "value": 5},
        {"color": "dark-red", "value": 8},
    ]
-    assert "hourly bucket" in problematic_panel["description"]
+    assert "rolling 24h failure count" in problematic_panel["description"]
+    assert "at least two recent failures" in problematic_panel["description"]

    sonar_mix_panel = nested_panels_by_title["Sonar Gate Status Mix (Selected)"]
    sonar_health_panel = nested_panels_by_title["Sonar Gate Health by Project"]
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -2130,7 +2130,7 @@
      },
      "targets": [
        {
-          "expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
+          "expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
          "refId": "A",
          "instant": true
        }
@@ -2819,7 +2819,7 @@
      },
      "targets": [
        {
-          "expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})",
+          "expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})",
          "refId": "A",
          "legendFormat": "{{category}}",
          "format": "time_series",
--- a/services/monitoring/dashboards/atlas-testing.json
+++ b/services/monitoring/dashboards/atlas-testing.json
@@ -20,7 +20,7 @@
      },
      "targets": [
        {
-          "expr": "(avg((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
+          "expr": "(avg((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
          "refId": "A",
          "instant": true
        }
@@ -605,7 +605,7 @@
      },
      "targets": [
        {
-          "expr": "sort(((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0)))) - 1))))",
+          "expr": "sort(((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0)))) - 1))))",
          "refId": "A",
          "legendFormat": "{{suite}}",
          "instant": true
@@ -1056,7 +1056,7 @@
          "id": 153,
          "type": "state-timeline",
          "title": "Test Category Health History",
-          "description": "Health by test category from current per-test metrics. Use the Suite filter to focus one project; skipped tests are healthy, while failures and errors lower the lane.",
+          "description": "Health by test category from memoized hourly rollups. Use the Suite filter to focus one project; skipped tests are healthy, while failures and errors lower the lane.",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@@ -1069,7 +1069,7 @@
          },
          "targets": [
            {
-              "expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})",
+              "expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})",
              "refId": "A",
              "legendFormat": "{{category}}",
              "format": "time_series",
@@ -2615,7 +2615,7 @@
          "id": 145,
          "type": "state-timeline",
          "title": "Problematic Tests Over Time (Top failures)",
-          "description": "Top failing tests inside each hourly bucket. Short current bars can still belong to tests with larger long-window totals.",
+          "description": "Current outlier tests by rolling 24h failure count. A test needs at least two recent failures to appear, then falls off once it quiets down.",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@@ -2628,7 +2628,7 @@
          },
          "targets": [
            {
-              "expr": "topk(12, (sum by (suite, test) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"})))",
+              "expr": "(sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[24h:1h]))) and on (suite, test) topk(12, (sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[24h:1h] @ end()))) >= 2)",
              "refId": "A",
              "legendFormat": "{{suite}} - {{test}}",
              "format": "time_series",
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -2139,7 +2139,7 @@ data:
          },
          "targets": [
            {
-              "expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
+              "expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
              "refId": "A",
              "instant": true
            }
@@ -2828,7 +2828,7 @@ data:
          },
          "targets": [
            {
-              "expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})",
+              "expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})",
              "refId": "A",
              "legendFormat": "{{category}}",
              "format": "time_series",
--- a/services/monitoring/grafana-dashboard-testing.yaml
+++ b/services/monitoring/grafana-dashboard-testing.yaml
@@ -29,7 +29,7 @@ data:
          },
          "targets": [
            {
-              "expr": "(avg((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
+              "expr": "(avg((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
              "refId": "A",
              "instant": true
            }
@ -614,7 +614,7 @@ data:
          },
          "targets": [
            {
-              "expr": "sort(((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0)))) - 1))))",
+              "expr": "sort(((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0)))) - 1))))",
              "refId": "A",
              "legendFormat": "{{suite}}",
              "instant": true
@ -1065,7 +1065,7 @@ data:
              "id": 153,
              "type": "state-timeline",
              "title": "Test Category Health History",
-              "description": "Health by test category from current per-test metrics. Use the Suite filter to focus one project; skipped tests are healthy, while failures and errors lower the lane.",
+              "description": "Health by test category from memoized hourly rollups. Use the Suite filter to focus one project; skipped tests are healthy, while failures and errors lower the lane.",
              "datasource": {
                "type": "prometheus",
                "uid": "atlas-vm"
@ -1078,7 +1078,7 @@ data:
              },
              "targets": [
                {
-                  "expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})",
+                  "expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})",
                  "refId": "A",
                  "legendFormat": "{{category}}",
                  "format": "time_series",
@ -2624,7 +2624,7 @@ data:
              "id": 145,
              "type": "state-timeline",
              "title": "Problematic Tests Over Time (Top failures)",
-              "description": "Top failing tests inside each hourly bucket. Short current bars can still belong to tests with larger long-window totals.",
+              "description": "Current outlier tests by rolling 24h failure count. A test needs at least two recent failures to appear, then falls off once it quiets down.",
              "datasource": {
                "type": "prometheus",
                "uid": "atlas-vm"
@ -2637,7 +2637,7 @@ data:
              },
              "targets": [
                {
-                  "expr": "topk(12, (sum by (suite, test) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"})))",
+                  "expr": "(sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[24h:1h]))) and on (suite, test) topk(12, (sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[24h:1h] @ end()))) >= 2)",
                  "refId": "A",
                  "legendFormat": "{{suite}} - {{test}}",
                  "format": "time_series",
--- a/services/monitoring/vmalert-atlas-availability.yaml
+++ b/services/monitoring/vmalert-atlas-availability.yaml
@ -227,6 +227,13 @@ data:
              )
            labels:
              rollup: hourly
+          - record: platform_quality:test_category_health_rate:percent_1h
+            expr: |
+              avg by (suite, branch, category) (
+                platform_quality:test_case_health_rate:percent_1h{suite!="",branch!="",test!="",test!="__no_test_cases__",category=~"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit"}
+              )
+            labels:
+              rollup: hourly
          - record: platform_quality:suite_runs:increase_24h
            expr: |
              sum by (suite, branch, status) (
@ -391,7 +398,7 @@ spec:
      labels:
        app: vmalert-atlas-availability
      annotations:
-        bstein.dev/rules-revision: "2026-05-20-platform-quality-fast-run-rollups"
+        bstein.dev/rules-revision: "2026-05-20-platform-quality-category-rollups"
    spec:
      serviceAccountName: vmalert-atlas-availability
      affinity: