monitoring(testing): derive gate health from raw checks
This commit is contained in:
parent
813d057c6d
commit
ba9b72312a
@ -627,6 +627,49 @@ PLATFORM_TEST_ACTIVE_SUITES_24H = (
|
|||||||
PLATFORM_TEST_POINT_WINDOW = "1h"
|
PLATFORM_TEST_POINT_WINDOW = "1h"
|
||||||
PLATFORM_TEST_FRESH_WINDOW = "30h"
|
PLATFORM_TEST_FRESH_WINDOW = "30h"
|
||||||
PLATFORM_TEST_LATEST_WINDOW = "30d"
|
PLATFORM_TEST_LATEST_WINDOW = "30d"
|
||||||
|
|
||||||
|
|
||||||
|
def platform_check_status_expr(
    suite_matcher: str,
    *,
    branch_matcher: str = 'branch!=""',
    check_matcher: str = 'check!=""',
    status_matcher: str = 'status!=""',
    window: str = PLATFORM_TEST_FRESH_WINDOW,
) -> str:
    """Return recent check gauges normalized to a status label.

    Builds a PromQL expression that unions four views of the
    ``*_quality_gate_checks_total`` series, each taken with
    ``last_over_time`` over ``window``:

    1. series that already carry both a branch and a ``status`` label;
    2. series that carry a branch and only a ``result`` label, which is
       copied into ``status`` with ``label_replace``;
    3. series with an empty branch and a ``status`` label, whose branch is
       taken from ``platform_quality_gate_build_info`` joined on ``suite``;
    4. the ``result``-only variant of (3).

    The union is summed by ``(suite, branch, check, status)``.

    Args:
        suite_matcher: PromQL label matcher(s) selecting the suites,
            e.g. ``suite=~"..."``.
        branch_matcher: Matcher(s) applied to the branch label of the check
            series (views 1 and 2) and of the build-info series.
        check_matcher: Matcher(s) applied to the ``check`` label.
        status_matcher: Matcher on the ``status`` label. The same matcher is
            applied to ``result`` by renaming the first ``status`` occurrence.
        window: Range-vector duration used for every ``last_over_time``.

    Returns:
        The PromQL expression as a string.
    """
    # Mirror the status filter onto the ``result`` label. Only the first
    # occurrence is renamed, so the matcher is expected to start with the
    # label name.
    result_matcher = status_matcher.replace("status", "result", 1)

    # PromQL treats an absent label as the empty string, so a negative matcher
    # such as status!~"ok|passed" also selects series that have no ``status``
    # label at all (for example, series that only carry result="ok"). Require
    # the label to be present so each view only sees the series it is meant
    # to normalize; for ``!=""`` and positive regex matchers this is a no-op.
    status_labels = f'{status_matcher},status!=""'
    result_labels = f'{result_matcher},result!="",status=""'

    base = (
        f'__name__=~".*_quality_gate_checks_total",{suite_matcher},'
        f'{PLATFORM_TEST_EXPORT_FILTER},{check_matcher}'
    )
    # Per-suite branch lookup used to fill in the branch for series that were
    # exported without one.
    build_info = (
        f'max by (suite, branch) (last_over_time(platform_quality_gate_build_info'
        f'{{{suite_matcher},{PLATFORM_TEST_EXPORT_FILTER},{branch_matcher}}}[{window}]))'
    )
    with_status = (
        f'last_over_time({{{base},{branch_matcher},{status_labels}}}[{window}])'
    )
    with_result = (
        f'label_replace(last_over_time({{{base},{branch_matcher},{result_labels}}}'
        f'[{window}]), "status", "$1", "result", "(.*)")'
    )
    inferred_status = (
        f'(last_over_time({{{base},branch="",{status_labels}}}[{window}]) '
        f'* on (suite) group_left(branch) ({build_info}))'
    )
    inferred_result = (
        f'(label_replace(last_over_time({{{base},branch="",{result_labels}}}'
        f'[{window}]), "status", "$1", "result", "(.*)") '
        f'* on (suite) group_left(branch) ({build_info}))'
    )
    return (
        "sum by (suite, branch, check, status) ("
        f"{with_status} or {with_result} or {inferred_status} or {inferred_result}"
        ")"
    )
|
||||||
|
|
||||||
|
|
||||||
PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS = [
|
PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS = [
|
||||||
{
|
{
|
||||||
"refId": chr(ord("A") + index),
|
"refId": chr(ord("A") + index),
|
||||||
@ -679,18 +722,26 @@ PLATFORM_TEST_CHECKS_SELECTOR = (
|
|||||||
f"{PLATFORM_TEST_EXPORT_FILTER}"
|
f"{PLATFORM_TEST_EXPORT_FILTER}"
|
||||||
)
|
)
|
||||||
PLATFORM_TEST_CHECK_ROLLUP_MATCHERS = (
|
PLATFORM_TEST_CHECK_ROLLUP_MATCHERS = (
|
||||||
f'suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}",branch!="",check!=""'
|
f'suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}"'
|
||||||
)
|
)
|
||||||
# This recording rule already folds recent scrape freshness into the current sample.
|
PLATFORM_TEST_PRIMARY_BRANCH_MATCHER = 'branch=~"main|master|origin/main|origin/master"'
|
||||||
# Do not wrap it in a multi-hour range for "latest" panels, or old failures linger.
|
PLATFORM_TEST_CHECK_ROLLUP_SELECTOR = platform_check_status_expr(
|
||||||
PLATFORM_TEST_CHECK_ROLLUP_SELECTOR = (
|
PLATFORM_TEST_CHECK_ROLLUP_MATCHERS,
|
||||||
f'platform_quality:check_status:present_1h{{{PLATFORM_TEST_CHECK_ROLLUP_MATCHERS},status!=""}}'
|
branch_matcher=PLATFORM_TEST_PRIMARY_BRANCH_MATCHER,
|
||||||
)
|
)
|
||||||
PLATFORM_TEST_CHECK_ROLLUP_OK_SELECTOR = (
|
PLATFORM_TEST_CHECK_ROLLUP_OK_SELECTOR = (
|
||||||
f'platform_quality:check_status:present_1h{{{PLATFORM_TEST_CHECK_ROLLUP_MATCHERS},status=~"{PLATFORM_TEST_NON_FAILURE_STATUS}"}}'
|
platform_check_status_expr(
|
||||||
|
PLATFORM_TEST_CHECK_ROLLUP_MATCHERS,
|
||||||
|
branch_matcher=PLATFORM_TEST_PRIMARY_BRANCH_MATCHER,
|
||||||
|
status_matcher=f'status=~"{PLATFORM_TEST_NON_FAILURE_STATUS}"',
|
||||||
|
)
|
||||||
)
|
)
|
||||||
PLATFORM_TEST_CHECK_ROLLUP_FAILED_SELECTOR = (
|
PLATFORM_TEST_CHECK_ROLLUP_FAILED_SELECTOR = (
|
||||||
f'platform_quality:check_status:present_1h{{{PLATFORM_TEST_CHECK_ROLLUP_MATCHERS},status!~"{PLATFORM_TEST_NON_FAILURE_STATUS}"}}'
|
platform_check_status_expr(
|
||||||
|
PLATFORM_TEST_CHECK_ROLLUP_MATCHERS,
|
||||||
|
branch_matcher=PLATFORM_TEST_PRIMARY_BRANCH_MATCHER,
|
||||||
|
status_matcher=f'status!~"{PLATFORM_TEST_NON_FAILURE_STATUS}"',
|
||||||
|
)
|
||||||
)
|
)
|
||||||
PLATFORM_TEST_CHECK_ROLLUP_SEEN_FLAGS = (
|
PLATFORM_TEST_CHECK_ROLLUP_SEEN_FLAGS = (
|
||||||
f'clamp_max(max by (suite, check) (({PLATFORM_TEST_CHECK_ROLLUP_SELECTOR}) > 0), 1)'
|
f'clamp_max(max by (suite, check) (({PLATFORM_TEST_CHECK_ROLLUP_SELECTOR}) > 0), 1)'
|
||||||
@ -3776,9 +3827,11 @@ def build_jobs_dashboard():
|
|||||||
check_matcher: str = 'check!=""',
|
check_matcher: str = 'check!=""',
|
||||||
status_matcher: str = 'status!=""',
|
status_matcher: str = 'status!=""',
|
||||||
) -> str:
|
) -> str:
|
||||||
selector = (
|
selector = platform_check_status_expr(
|
||||||
f'platform_quality:check_status:present_1h{{suite=~"{suite_var}",branch!="",'
|
f'suite=~"{suite_var}"',
|
||||||
f'branch=~"{branch_var}",{check_matcher},{status_matcher}}}'
|
branch_matcher=f'branch!="",branch=~"{branch_var}"',
|
||||||
|
check_matcher=check_matcher,
|
||||||
|
status_matcher=status_matcher,
|
||||||
)
|
)
|
||||||
return selector
|
return selector
|
||||||
|
|
||||||
|
|||||||
@ -227,8 +227,11 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability():
|
|||||||
|
|
||||||
current_gate_expr = panels_by_title["Latest Gate Checks Passing by Suite"]["targets"][0]["expr"]
|
current_gate_expr = panels_by_title["Latest Gate Checks Passing by Suite"]["targets"][0]["expr"]
|
||||||
assert 'check)' in current_gate_expr
|
assert 'check)' in current_gate_expr
|
||||||
assert "platform_quality:check_status:present_1h" in current_gate_expr
|
assert "platform_quality:check_status:present_1h" not in current_gate_expr
|
||||||
assert "platform_quality_gate_checks_total" not in current_gate_expr
|
assert '.*_quality_gate_checks_total' in current_gate_expr
|
||||||
|
assert "last_over_time" in current_gate_expr
|
||||||
|
assert 'label_replace' in current_gate_expr
|
||||||
|
assert 'result=~"ok|passed|success|not_applicable|skipped|na|n/a"' in current_gate_expr
|
||||||
assert 'status=~"ok|passed|success|not_applicable|skipped|na|n/a"' in current_gate_expr
|
assert 'status=~"ok|passed|success|not_applicable|skipped|na|n/a"' in current_gate_expr
|
||||||
assert 'status!~"ok|passed|success|not_applicable|skipped|na|n/a"' in current_gate_expr
|
assert 'status!~"ok|passed|success|not_applicable|skipped|na|n/a"' in current_gate_expr
|
||||||
assert "unless on(suite, check)" in current_gate_expr
|
assert "unless on(suite, check)" in current_gate_expr
|
||||||
@ -337,8 +340,10 @@ def test_jobs_dashboard_collapses_heavy_drilldowns_for_light_first_paint():
|
|||||||
assert failure_rate_panel["fieldConfig"]["defaults"]["max"] == 100
|
assert failure_rate_panel["fieldConfig"]["defaults"]["max"] == 100
|
||||||
assert failure_rate_panel["fieldConfig"]["defaults"]["thresholds"]["steps"][0]["color"] == "dark-blue"
|
assert failure_rate_panel["fieldConfig"]["defaults"]["thresholds"]["steps"][0]["color"] == "dark-blue"
|
||||||
assert "increase(" not in failure_rate_panel["targets"][0]["expr"]
|
assert "increase(" not in failure_rate_panel["targets"][0]["expr"]
|
||||||
assert "platform_quality:check_status:present_1h" in failure_rate_panel["targets"][0]["expr"]
|
assert "platform_quality:check_status:present_1h" not in failure_rate_panel["targets"][0]["expr"]
|
||||||
assert "platform_quality_gate_checks_total" not in failure_rate_panel["targets"][0]["expr"]
|
assert '.*_quality_gate_checks_total' in failure_rate_panel["targets"][0]["expr"]
|
||||||
|
assert "last_over_time" in failure_rate_panel["targets"][0]["expr"]
|
||||||
|
assert 'label_replace' in failure_rate_panel["targets"][0]["expr"]
|
||||||
assert "0 *" in failure_rate_panel["targets"][0]["expr"]
|
assert "0 *" in failure_rate_panel["targets"][0]["expr"]
|
||||||
assert "and on(suite)" not in failure_rate_panel["targets"][0]["expr"]
|
assert "and on(suite)" not in failure_rate_panel["targets"][0]["expr"]
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user