monitoring(testing): derive gate health from raw checks

This commit is contained in:
jenkins 2026-05-19 03:59:55 -03:00
parent 813d057c6d
commit ba9b72312a
4 changed files with 102 additions and 44 deletions

View File

@ -627,6 +627,49 @@ PLATFORM_TEST_ACTIVE_SUITES_24H = (
PLATFORM_TEST_POINT_WINDOW = "1h"
PLATFORM_TEST_FRESH_WINDOW = "30h"
PLATFORM_TEST_LATEST_WINDOW = "30d"
def platform_check_status_expr(
    suite_matcher: str,
    *,
    branch_matcher: str = 'branch!=""',
    check_matcher: str = 'check!=""',
    status_matcher: str = 'status!=""',
    window: str = PLATFORM_TEST_FRESH_WINDOW,
) -> str:
    """Build a PromQL expression for recent gate checks keyed by ``status``.

    Four selections over ``*_quality_gate_checks_total`` series are combined
    with ``or`` and then summed by ``(suite, branch, check, status)``:

    1. series matching ``branch_matcher`` and ``status_matcher``;
    2. series matching ``branch_matcher`` with an empty ``status`` label,
       filtered on ``result`` and with ``result`` copied into ``status``;
    3. as (1) but for series with an empty ``branch`` label, where ``branch``
       is taken from ``platform_quality_gate_build_info`` via a join on
       ``suite``;
    4. as (2) with the same ``branch`` join as (3).

    Args:
        suite_matcher: Label matcher text selecting the suite(s).
        branch_matcher: Label matcher text applied to ``branch``; also used
            to filter the build-info series that supplies ``branch``.
        check_matcher: Label matcher text applied to ``check``.
        status_matcher: Label matcher text applied to ``status``. The first
            occurrence of ``"status"`` is rewritten to ``"result"`` to obtain
            the matcher for the ``result``-labelled series.
        window: Range-selector duration used by every ``last_over_time``.

    Returns:
        The assembled PromQL expression as a string.
    """
    # Same predicate, aimed at the "result" label instead of "status".
    result_matcher = status_matcher.replace("status", "result", 1)
    # Series that carry "result" are only considered when "status" is empty.
    result_only = f'{result_matcher},status=""'
    unbranched = 'branch=""'

    checks = (
        f'__name__=~".*_quality_gate_checks_total",{suite_matcher},'
        f'{PLATFORM_TEST_EXPORT_FILTER},{check_matcher}'
    )
    build_info_selector = (
        f'{suite_matcher},{PLATFORM_TEST_EXPORT_FILTER},{branch_matcher}'
    )
    build_info = (
        'max by (suite, branch) (last_over_time(platform_quality_gate_build_info'
        f'{{{build_info_selector}}}[{window}]))'
    )

    def latest(branch: str, outcome: str) -> str:
        """Return the most recent sample per check series within the window."""
        return f'last_over_time({{{checks},{branch},{outcome}}}[{window}])'

    def result_as_status(expr: str) -> str:
        """Copy the ``result`` label value into ``status``."""
        return f'label_replace({expr}, "status", "$1", "result", "(.*)")'

    def with_build_branch(expr: str) -> str:
        """Attach ``branch`` from the build-info series, joined on ``suite``."""
        # NOTE(review): the join multiplies by the build-info sample value and
        # needs one build-info series per suite on the right-hand side; confirm
        # that callers' branch matchers guarantee this.
        return f'({expr} * on (suite) group_left(branch) ({build_info}))'

    alternatives = (
        latest(branch_matcher, status_matcher),
        result_as_status(latest(branch_matcher, result_only)),
        with_build_branch(latest(unbranched, status_matcher)),
        with_build_branch(result_as_status(latest(unbranched, result_only))),
    )
    return (
        "sum by (suite, branch, check, status) ("
        + " or ".join(alternatives)
        + ")"
    )
PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS = [
{
"refId": chr(ord("A") + index),
@ -679,18 +722,26 @@ PLATFORM_TEST_CHECKS_SELECTOR = (
f"{PLATFORM_TEST_EXPORT_FILTER}"
)
PLATFORM_TEST_CHECK_ROLLUP_MATCHERS = (
f'suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}",branch!="",check!=""'
f'suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}"'
)
# This recording rule already folds recent scrape freshness into the current sample.
# Do not wrap it in a multi-hour range for "latest" panels, or old failures linger.
PLATFORM_TEST_CHECK_ROLLUP_SELECTOR = (
f'platform_quality:check_status:present_1h{{{PLATFORM_TEST_CHECK_ROLLUP_MATCHERS},status!=""}}'
PLATFORM_TEST_PRIMARY_BRANCH_MATCHER = 'branch=~"main|master|origin/main|origin/master"'
PLATFORM_TEST_CHECK_ROLLUP_SELECTOR = platform_check_status_expr(
PLATFORM_TEST_CHECK_ROLLUP_MATCHERS,
branch_matcher=PLATFORM_TEST_PRIMARY_BRANCH_MATCHER,
)
PLATFORM_TEST_CHECK_ROLLUP_OK_SELECTOR = (
f'platform_quality:check_status:present_1h{{{PLATFORM_TEST_CHECK_ROLLUP_MATCHERS},status=~"{PLATFORM_TEST_NON_FAILURE_STATUS}"}}'
platform_check_status_expr(
PLATFORM_TEST_CHECK_ROLLUP_MATCHERS,
branch_matcher=PLATFORM_TEST_PRIMARY_BRANCH_MATCHER,
status_matcher=f'status=~"{PLATFORM_TEST_NON_FAILURE_STATUS}"',
)
)
PLATFORM_TEST_CHECK_ROLLUP_FAILED_SELECTOR = (
f'platform_quality:check_status:present_1h{{{PLATFORM_TEST_CHECK_ROLLUP_MATCHERS},status!~"{PLATFORM_TEST_NON_FAILURE_STATUS}"}}'
platform_check_status_expr(
PLATFORM_TEST_CHECK_ROLLUP_MATCHERS,
branch_matcher=PLATFORM_TEST_PRIMARY_BRANCH_MATCHER,
status_matcher=f'status!~"{PLATFORM_TEST_NON_FAILURE_STATUS}"',
)
)
PLATFORM_TEST_CHECK_ROLLUP_SEEN_FLAGS = (
f'clamp_max(max by (suite, check) (({PLATFORM_TEST_CHECK_ROLLUP_SELECTOR}) > 0), 1)'
@ -3776,9 +3827,11 @@ def build_jobs_dashboard():
check_matcher: str = 'check!=""',
status_matcher: str = 'status!=""',
) -> str:
selector = (
f'platform_quality:check_status:present_1h{{suite=~"{suite_var}",branch!="",'
f'branch=~"{branch_var}",{check_matcher},{status_matcher}}}'
selector = platform_check_status_expr(
f'suite=~"{suite_var}"',
branch_matcher=f'branch!="",branch=~"{branch_var}"',
check_matcher=check_matcher,
status_matcher=status_matcher,
)
return selector

View File

@ -227,8 +227,11 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability():
current_gate_expr = panels_by_title["Latest Gate Checks Passing by Suite"]["targets"][0]["expr"]
assert 'check)' in current_gate_expr
assert "platform_quality:check_status:present_1h" in current_gate_expr
assert "platform_quality_gate_checks_total" not in current_gate_expr
assert "platform_quality:check_status:present_1h" not in current_gate_expr
assert '.*_quality_gate_checks_total' in current_gate_expr
assert "last_over_time" in current_gate_expr
assert 'label_replace' in current_gate_expr
assert 'result=~"ok|passed|success|not_applicable|skipped|na|n/a"' in current_gate_expr
assert 'status=~"ok|passed|success|not_applicable|skipped|na|n/a"' in current_gate_expr
assert 'status!~"ok|passed|success|not_applicable|skipped|na|n/a"' in current_gate_expr
assert "unless on(suite, check)" in current_gate_expr
@ -337,8 +340,10 @@ def test_jobs_dashboard_collapses_heavy_drilldowns_for_light_first_paint():
assert failure_rate_panel["fieldConfig"]["defaults"]["max"] == 100
assert failure_rate_panel["fieldConfig"]["defaults"]["thresholds"]["steps"][0]["color"] == "dark-blue"
assert "increase(" not in failure_rate_panel["targets"][0]["expr"]
assert "platform_quality:check_status:present_1h" in failure_rate_panel["targets"][0]["expr"]
assert "platform_quality_gate_checks_total" not in failure_rate_panel["targets"][0]["expr"]
assert "platform_quality:check_status:present_1h" not in failure_rate_panel["targets"][0]["expr"]
assert '.*_quality_gate_checks_total' in failure_rate_panel["targets"][0]["expr"]
assert "last_over_time" in failure_rate_panel["targets"][0]["expr"]
assert 'label_replace' in failure_rate_panel["targets"][0]["expr"]
assert "0 *" in failure_rate_panel["targets"][0]["expr"]
assert "and on(suite)" not in failure_rate_panel["targets"][0]["expr"]

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long