monitoring(testing): backfill category health rollups

This commit is contained in:
jenkins 2026-05-20 14:37:00 -03:00
parent 0ea80a8a19
commit 974955ac83
7 changed files with 62 additions and 41 deletions

View File

@ -608,6 +608,7 @@ PLATFORM_TEST_SOURCE_LINES_OVER_500_ROLLUP = (
"platform_quality:suite_source_lines_over_500_total:latest_1h"
)
# Series names used by the testing dashboards. The colon-separated
# "platform_quality:<metric>:<window>" form looks like Prometheus recording-rule
# naming -- NOTE(review): confirm these match the deployed rule names exactly.
PLATFORM_TEST_SONAR_HEALTH_ROLLUP = "platform_quality:sonar_gate_health_percent:latest_1h"
# Per-category health rollup; it is selected with suite/branch/category matchers
# only (no per-test labels) by the category health expressions in this module.
PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP = "platform_quality:test_category_health_rate:percent_1h"
# PromQL text: sum of whatever platform_runs_increase(...) renders for the
# success-status selector with the "30d" / "15m" arguments, falling back to a
# literal 0 via `or on() vector(0)` when that sum has no series.
# NOTE(review): the inner f-string uses backslash-escaped quotes inside the outer
# f-string's replacement field, which only parses on Python 3.12+ (PEP 701).
PLATFORM_TEST_SUCCESS_EVENTS_30D = (
f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",status=~\"{PLATFORM_TEST_SUCCESS_STATUS}\",{PLATFORM_TEST_EXPORT_FILTER}", "30d", "15m")}) or on() vector(0))'
)
@ -803,10 +804,10 @@ PLATFORM_TEST_CURRENT_GATE_CHECK_HEALTH_BY_SUITE = (
f'/ clamp_min(sum by (suite) ({PLATFORM_TEST_CHECK_ROLLUP_SEEN_FLAGS}), 1))'
)
PLATFORM_TEST_CATEGORY_HEALTH_BY_SUITE = (
'min by (suite) (platform_quality:test_case_health_rate:percent_1h{'
f'min by (suite) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{'
f'suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}",branch!="",'
f'branch=~"main|master|origin/main|origin/master",test!="",'
f'test!="__no_test_cases__",category=~"{PLATFORM_TEST_CATEGORY_REGEX}"'
f'branch=~"main|master|origin/main|origin/master",'
f'category=~"{PLATFORM_TEST_CATEGORY_REGEX}"'
"})"
)
PLATFORM_TEST_CURRENT_GATE_HEALTH_OBSERVED_BY_SUITE = (
@ -1843,7 +1844,7 @@ TESTING_PANEL_DESCRIPTIONS = {
"SonarQube Healthy Rate": "Percent of Sonar checks passing or not applicable; higher is better.",
"Supply Chain Healthy Rate": "Percent of supply-chain checks passing or not applicable; higher is better.",
"Test Drilldowns And Problem Tests": "Test-case detail for finding which tests are hurting reliability.",
"Problematic Tests Over Time (Top failures)": "Top tests failing in each hourly bucket; old totals may only appear in the 30d panel.",
"Problematic Tests Over Time (Top failures)": "Current outlier tests by rolling 24h failures; tests need repeat failures to stay visible.",
"Most Problematic Test by Suite (30d)": "Worst test per suite summed over 30d; high counts can be historical debt.",
"Selected Test Pass/Fail History": "Hourly pass/fail/skipped volume for the selected test filter.",
"Selected Test Pass Rate History": "Pass rate history for the selected test filter; higher means the test is stable.",
@ -2217,9 +2218,9 @@ def build_overview():
}
overview_avg_coverage = f"(avg(({QUALITY_GATE_COVERAGE_BY_SUITE})) or on() vector(0))"
overview_category_health = (
'avg by (category) (platform_quality:test_case_health_rate:percent_1h{'
f'avg by (category) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{'
f'suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}",branch!="",branch=~"main|master|origin/main|origin/master",'
f'test!="",test!="__no_test_cases__",category=~"{PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX}"'
f'category=~"{PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX}"'
"})"
)
for panel_id, title, draw_expr, runtime_expr, y_pos in [
@ -3972,9 +3973,8 @@ def build_jobs_dashboard():
f"(100 * ({current_gate_ok_checks}) / clamp_min(({current_gate_seen_checks}), 1))"
)
current_category_health_by_suite = (
f'min by (suite) (platform_quality:test_case_health_rate:percent_1h{{suite=~"{suite_var}",'
f'branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",'
f'category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})'
f'min by (suite) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{suite=~"{suite_var}",'
f'branch!="",branch=~"{branch_var}",category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})'
)
current_gate_health_observed_by_suite = (
f"min by (suite) (({current_gate_check_health_by_suite}) "
@ -4064,10 +4064,20 @@ def build_jobs_dashboard():
state_percent = f"(100 * ({state_checks}) / clamp_min(({total_checks}), 1))"
return f"(({state_percent}) or on(suite) ({selected_suite_zero}))"
rollup_failed_tests = (
f'sum by (suite, test) (platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}})'
# Shared selector for the failed-status series of the test_case_status:count_1h
# rollup, scoped by the suite/branch variables and excluding empty test labels
# and the "__no_test_cases__" placeholder.
failed_test_status_selector = (
f'platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",'
f'branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}}'
)
# Failures per (suite, test) summed over the trailing 24h, read at 1h resolution
# through the [24h:1h] subquery; evaluated at every step of the panel's range.
rollup_failed_tests = (
f"sum by (suite, test) (sum_over_time({failed_test_status_selector}[24h:1h]))"
)
# Same 24h sum, but `@ end()` pins the evaluation to the end of the query range,
# so the candidate set is identical at every step instead of changing per step.
current_problem_test_candidates = (
f"sum by (suite, test) (sum_over_time({failed_test_status_selector}[24h:1h] @ end()))"
)
# Plot the rolling history only for (suite, test) pairs that pass the filter on
# the right: candidates with >= 2 failures in the final 24h window, cut to the
# 12 highest. `and on (suite, test)` keeps the left-hand values and uses the
# right-hand side purely as a membership filter.
problematic_tests_history_core = (
f"({rollup_failed_tests}) "
f"and on (suite, test) topk(12, ({current_problem_test_candidates}) >= 2)"
)
problematic_tests_history_core = f"topk(12, ({rollup_failed_tests}))"
problematic_tests_history = problematic_tests_history_core
rollup_failed_tests_30d = (
f'sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}}[30d:1h]))'
@ -4106,9 +4116,8 @@ def build_jobs_dashboard():
f'branch!="",branch=~"{branch_var}",test!="",test=~"{test_var}",test!="__no_test_cases__"}})'
)
category_pass_rate_history = (
f'avg by (category) (platform_quality:test_case_health_rate:percent_1h{{suite=~"{suite_var}",'
f'branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",'
f'category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})'
f'avg by (category) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{suite=~"{suite_var}",'
f'branch!="",branch=~"{branch_var}",category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})'
)
recent_branch_evidence = (
f'sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[30d:15m])))'
@ -4467,8 +4476,8 @@ def build_jobs_dashboard():
max_value=None,
legend="{{suite}} - {{test}}",
description=(
"Top failing tests inside each hourly bucket. Short current bars can still belong to tests "
"with larger long-window totals."
"Current outlier tests by rolling 24h failure count. A test needs at least two recent "
"failures to appear, then falls off once it quiets down."
),
)
)
@ -4544,7 +4553,7 @@ def build_jobs_dashboard():
thresholds=success_thresholds,
legend="{{category}}",
description=(
"Health by test category from current per-test metrics. Use the Suite filter to focus one "
"Health by test category from memoized hourly rollups. Use the Suite filter to focus one "
"project; skipped tests are healthy, while failures and errors lower the lane."
),
)

View File

@ -79,7 +79,7 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
assert panels_by_title["Test Category Health"]["targets"][0]["range"] is True
assert "${overview_suite:regex}" not in panels_by_title["Test Category Health"]["targets"][0]["expr"]
assert mod.PLATFORM_TEST_SUITE_CANONICAL_MATCHER in panels_by_title["Test Category Health"]["targets"][0]["expr"]
assert "platform_quality:test_case_health_rate:percent_1h" in panels_by_title["Test Category Health"]["targets"][0]["expr"]
assert "platform_quality:test_category_health_rate:percent_1h" in panels_by_title["Test Category Health"]["targets"][0]["expr"]
assert panels_by_title["Test Category Health"]["timeFrom"] == "24h"
assert f'category=~"{mod.PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX}"' in panels_by_title["Test Category Health"]["targets"][0]["expr"]
assert "manual" not in mod.PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX
@ -127,7 +127,7 @@ def test_overview_uses_readable_quality_power_and_gitops_panels():
assert panels_by_title["Flux Source"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 7}
assert panels_by_title["Flux Source"]["targets"][0]["legendFormat"] == "{{branch}}"
assert panels_by_title["Current Gate Health"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 9}
assert "platform_quality:test_case_health_rate:percent_1h" in panels_by_title["Current Gate Health"]["targets"][0]["expr"]
assert "platform_quality:test_category_health_rate:percent_1h" in panels_by_title["Current Gate Health"]["targets"][0]["expr"]
assert panels_by_title["CI Run Success (24h)"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 11}
assert panels_by_title["Suites With Runs (24h)"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 15}
suites_reporting_expr = panels_by_title["Suites With Runs (24h)"]["targets"][0]["expr"]
@ -245,7 +245,7 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability():
current_gate_expr = panels_by_title["Latest Gate Health by Suite"]["targets"][0]["expr"]
assert 'check)' in current_gate_expr
assert "platform_quality:check_status:present_1h" in current_gate_expr
assert "platform_quality:test_case_health_rate:percent_1h" in current_gate_expr
assert "platform_quality:test_category_health_rate:percent_1h" in current_gate_expr
assert "- 1" in current_gate_expr
assert '.*_quality_gate_checks_total' not in current_gate_expr
assert "last_over_time" not in current_gate_expr
@ -256,7 +256,7 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability():
assert "tlast_over_time" not in current_gate_expr
assert panels_by_title["Current Gate Health (%)"]["gridPos"] == {"h": 4, "w": 3, "x": 0, "y": 0}
current_gate_stat_expr = panels_by_title["Current Gate Health (%)"]["targets"][0]["expr"]
assert "platform_quality:test_case_health_rate:percent_1h" in current_gate_stat_expr
assert "platform_quality:test_category_health_rate:percent_1h" in current_gate_stat_expr
assert "- 1" not in current_gate_stat_expr
assert panels_by_title["Suite Freshness (24h)"]["gridPos"] == {"h": 4, "w": 3, "x": 15, "y": 0}
suite_freshness_expr = panels_by_title["Suite Freshness (24h)"]["targets"][0]["expr"]
@ -287,7 +287,8 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability():
assert "category" in category_panel["targets"][0]["expr"]
assert category_panel["targets"][0]["format"] == "time_series"
assert category_panel["targets"][0]["range"] is True
assert "platform_quality:test_case_health_rate:percent_1h" in category_panel["targets"][0]["expr"]
assert "platform_quality:test_category_health_rate:percent_1h" in category_panel["targets"][0]["expr"]
assert 'test!=""' not in category_panel["targets"][0]["expr"]
assert f'category=~"{mod.PLATFORM_TEST_CATEGORY_REGEX}"' in category_panel["targets"][0]["expr"]
assert "installer" not in mod.PLATFORM_TEST_CATEGORY_REGEX
assert "Use the Suite filter" in category_panel["description"]
@ -305,6 +306,9 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability():
problematic_expr = panels_by_title["Problematic Tests Over Time (Top failures)"]["targets"][0]["expr"]
assert "platform_quality:test_case_status:count_1h" in problematic_expr
assert "platform_quality_gate_test_case_result" not in problematic_expr
assert "[24h:1h]" in problematic_expr
assert ">= 2" in problematic_expr
assert "@ end()" in problematic_expr
coverage_panel = panels_by_title["Coverage History by Suite"]
loc_panel = panels_by_title["Files <=500 LOC History by Suite"]
@ -424,7 +428,8 @@ def test_jobs_dashboard_collapses_heavy_drilldowns_for_light_first_paint():
{"color": "dark-orange", "value": 5},
{"color": "dark-red", "value": 8},
]
assert "hourly bucket" in problematic_panel["description"]
assert "rolling 24h failure count" in problematic_panel["description"]
assert "at least two recent failures" in problematic_panel["description"]
sonar_mix_panel = nested_panels_by_title["Sonar Gate Status Mix (Selected)"]
sonar_health_panel = nested_panels_by_title["Sonar Gate Health by Project"]

View File

@ -2130,7 +2130,7 @@
},
"targets": [
{
"expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
"expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
"refId": "A",
"instant": true
}
@ -2819,7 +2819,7 @@
},
"targets": [
{
"expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})",
"expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})",
"refId": "A",
"legendFormat": "{{category}}",
"format": "time_series",

View File

@ -20,7 +20,7 @@
},
"targets": [
{
"expr": "(avg((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
"expr": "(avg((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
"refId": "A",
"instant": true
}
@ -605,7 +605,7 @@
},
"targets": [
{
"expr": "sort(((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0)))) - 1))))",
"expr": "sort(((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0)))) - 1))))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -1056,7 +1056,7 @@
"id": 153,
"type": "state-timeline",
"title": "Test Category Health History",
"description": "Health by test category from current per-test metrics. Use the Suite filter to focus one project; skipped tests are healthy, while failures and errors lower the lane.",
"description": "Health by test category from memoized hourly rollups. Use the Suite filter to focus one project; skipped tests are healthy, while failures and errors lower the lane.",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1069,7 +1069,7 @@
},
"targets": [
{
"expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})",
"expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})",
"refId": "A",
"legendFormat": "{{category}}",
"format": "time_series",
@ -2615,7 +2615,7 @@
"id": 145,
"type": "state-timeline",
"title": "Problematic Tests Over Time (Top failures)",
"description": "Top failing tests inside each hourly bucket. Short current bars can still belong to tests with larger long-window totals.",
"description": "Current outlier tests by rolling 24h failure count. A test needs at least two recent failures to appear, then falls off once it quiets down.",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -2628,7 +2628,7 @@
},
"targets": [
{
"expr": "topk(12, (sum by (suite, test) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"})))",
"expr": "(sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[24h:1h]))) and on (suite, test) topk(12, (sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[24h:1h] @ end()))) >= 2)",
"refId": "A",
"legendFormat": "{{suite}} - {{test}}",
"format": "time_series",

View File

@ -2139,7 +2139,7 @@ data:
},
"targets": [
{
"expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
"expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
"refId": "A",
"instant": true
}
@ -2828,7 +2828,7 @@ data:
},
"targets": [
{
"expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})",
"expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})",
"refId": "A",
"legendFormat": "{{category}}",
"format": "time_series",

View File

@ -29,7 +29,7 @@ data:
},
"targets": [
{
"expr": "(avg((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
"expr": "(avg((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
"refId": "A",
"instant": true
}
@ -614,7 +614,7 @@ data:
},
"targets": [
{
"expr": "sort(((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0)))) - 1))))",
"expr": "sort(((min by (suite) (((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0)))) - 1))))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -1065,7 +1065,7 @@ data:
"id": 153,
"type": "state-timeline",
"title": "Test Category Health History",
"description": "Health by test category from current per-test metrics. Use the Suite filter to focus one project; skipped tests are healthy, while failures and errors lower the lane.",
"description": "Health by test category from memoized hourly rollups. Use the Suite filter to focus one project; skipped tests are healthy, while failures and errors lower the lane.",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1078,7 +1078,7 @@ data:
},
"targets": [
{
"expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})",
"expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})",
"refId": "A",
"legendFormat": "{{category}}",
"format": "time_series",
@ -2624,7 +2624,7 @@ data:
"id": 145,
"type": "state-timeline",
"title": "Problematic Tests Over Time (Top failures)",
"description": "Top failing tests inside each hourly bucket. Short current bars can still belong to tests with larger long-window totals.",
"description": "Current outlier tests by rolling 24h failure count. A test needs at least two recent failures to appear, then falls off once it quiets down.",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -2637,7 +2637,7 @@ data:
},
"targets": [
{
"expr": "topk(12, (sum by (suite, test) (platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"})))",
"expr": "(sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[24h:1h]))) and on (suite, test) topk(12, (sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[24h:1h] @ end()))) >= 2)",
"refId": "A",
"legendFormat": "{{suite}} - {{test}}",
"format": "time_series",

View File

@ -227,6 +227,13 @@ data:
)
labels:
rollup: hourly
- record: platform_quality:test_category_health_rate:percent_1h
expr: |
avg by (suite, branch, category) (
platform_quality:test_case_health_rate:percent_1h{suite!="",branch!="",test!="",test!="__no_test_cases__",category=~"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit"}
)
labels:
rollup: hourly
- record: platform_quality:suite_runs:increase_24h
expr: |
sum by (suite, branch, status) (
@ -391,7 +398,7 @@ spec:
labels:
app: vmalert-atlas-availability
annotations:
bstein.dev/rules-revision: "2026-05-20-platform-quality-fast-run-rollups"
bstein.dev/rules-revision: "2026-05-20-platform-quality-category-rollups"
spec:
serviceAccountName: vmalert-atlas-availability
affinity: