From fe37f12e32ba81a88f408dfb7f01d38b054455a3 Mon Sep 17 00:00:00 2001 From: jenkins Date: Wed, 20 May 2026 11:01:28 -0300 Subject: [PATCH] monitoring(testing): surface current gate health --- scripts/dashboards_render_atlas.py | 116 ++++++++--- scripts/tests/test_dashboards_render_atlas.py | 63 +++--- .../monitoring/dashboards/atlas-overview.json | 188 +++++++++--------- .../monitoring/dashboards/atlas-testing.json | 184 +++++++++++++++-- .../grafana-dashboard-overview.yaml | 188 +++++++++--------- .../monitoring/grafana-dashboard-testing.yaml | 184 +++++++++++++++-- .../vmalert-atlas-availability.yaml | 36 ++++ 7 files changed, 686 insertions(+), 273 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 988012ce..e4a1118e 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -803,10 +803,24 @@ PLATFORM_TEST_CHECK_ROLLUP_OK_FLAGS = ( f'clamp_max(max by (suite, check) (({PLATFORM_TEST_CHECK_ROLLUP_OK_SELECTOR}) > 0), 1) ' f'unless on(suite, check) ({PLATFORM_TEST_CHECK_ROLLUP_FAILED_FLAGS})' ) -PLATFORM_TEST_CURRENT_GATE_HEALTH_BY_SUITE = ( +PLATFORM_TEST_CURRENT_GATE_CHECK_HEALTH_BY_SUITE = ( f'(100 * sum by (suite) ({PLATFORM_TEST_CHECK_ROLLUP_OK_FLAGS}) ' f'/ clamp_min(sum by (suite) ({PLATFORM_TEST_CHECK_ROLLUP_SEEN_FLAGS}), 1))' ) +PLATFORM_TEST_CATEGORY_HEALTH_BY_SUITE = ( + 'min by (suite) (platform_quality:test_case_health_rate:percent_1h{' + f'suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}",branch!="",' + f'branch=~"main|master|origin/main|origin/master",test!="",' + f'test!="__no_test_cases__",category=~"{PLATFORM_TEST_CATEGORY_REGEX}"' + "})" +) +PLATFORM_TEST_CURRENT_GATE_HEALTH_BY_SUITE = ( + f'min by (suite) (({PLATFORM_TEST_CURRENT_GATE_CHECK_HEALTH_BY_SUITE}) ' + f'or ({PLATFORM_TEST_CATEGORY_HEALTH_BY_SUITE}))' +) +PLATFORM_TEST_CURRENT_GATE_HEALTH = ( + f"(avg(({PLATFORM_TEST_CURRENT_GATE_HEALTH_BY_SUITE})) or on() vector(0))" +) PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))" ANANKE_SELECTOR = 'job="ananke-power"' ANANKE_UPS_DB_NAME = "Pyrphoros" @@ -1742,6 +1756,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = { "Enclosure Climate History": "Temperature, humidity, and VPD over time; smooth movement is healthy, sharp swings need attention.", "Fan Intensity History": "Fan levels from Off to 10; warmer colors mean stronger cooling response and more thermal pressure.", "Flux Source": "Git branch Flux is applying; this should normally be the intended production branch.", + "Current Gate Health": "Current gate-check health across suites; skipped or not-applicable checks count as healthy, failures lower it.", "CI Run Success (24h)": "Percent of published quality-gate CI runs that completed successfully in 24h; this is automation health, not raw test pass rate.", "Failed Runs (24h)": "Published quality-gate runs that failed in 24h; zero is good, any value needs a look.", "Suites With Runs (24h)": "Configured suites with at least one published quality-gate run in 24h; full count means the dashboard is fresh.", @@ -1750,7 +1765,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = { "GitOps Health": "Flux readiness and suspension health over time; blue is perfect, warmer colors mean drift or pause.", "One-off Job Pods (age hours)": "Temporary job pods by age; low or empty is good, old pods usually need cleanup.", "Ariadne Run Volume": "Ariadne automation attempts and failures; attempts show activity, failures show work to investigate.", - "Test Category Pass Rate": "Pass rate by test category across all suites; blue means clean categories, warmer colors show problem areas.", + "Test Category Health": "Current category health across suites; skipped tests count as healthy, failures lower the lane.", "Jenkins Last Success (h, newest first)": "Age of recent Jenkins successes; lower is fresher and better.", "Jenkins Last Failure (h, newest first)": "Age of recent Jenkins failures; lower means a failure happened more recently.", "PVC Backup Health / Age": "Restic backup age by PVC; lower is better, very old backups mean restore risk.", @@ -1778,19 +1793,21 @@ OVERVIEW_PANEL_DESCRIPTIONS = { TESTING_PANEL_DESCRIPTIONS = { + "Current Gate Health (%)": "Average latest required gate checks passing across selected suites; this is the current quality state.", "CI Run Success Rate (24h)": "Percent of selected quality-gate CI runs that completed successfully in 24h; this is run health, not individual test pass rate.", "CI Run Success Rate (30d)": "Percent of selected quality-gate CI runs that completed successfully in 30d; higher means more stable automation.", "Failed Runs (24h)": "Selected quality-gate runs that failed in 24h; zero is good and anything else needs a look.", "CI Runs (24h)": "Selected quality-gate CI run count in 24h; zero means the dashboard may be stale.", + "Suite Freshness (24h)": "Percent of selected suites with at least one quality-gate CI run in 24h; 100% means inputs are fresh.", "Avg Coverage (%)": "Average latest line coverage for selected suites; higher means better test protection.", "Suites with LOC >500": "Selected suites with oversized source files; zero is good for maintainability.", - "Latest Gate Checks Passing by Suite": "Latest required gate checks passing by suite; this includes tests plus coverage, LOC, style, and other gates.", + "Latest Gate Health by Suite": "Latest required gate health by suite; skipped and not-applicable results are healthy, failures lower it.", "CI Run Success by Suite (24h)": "24h CI run success rate by suite; lower rows mean recent jobs failed, aborted, or could not complete cleanly.", "Coverage by Suite (Latest, gate 95)": "Latest suite coverage; 95%+ is acceptable and 100% is strongest.", "Files <=500 LOC by Suite (Latest)": "Percent of source files within the 500-line limit; higher is easier to maintain.", "CI Runs And Test Result History": "Recent CI run, coverage, LOC, and raw test-result trends for selected suites.", "CI Run Success by Suite (7d rolling)": "Seven-day rolling CI run success rate by suite; this is run completion history, not raw test pass history.", - "Test Category Pass Rate History": "Pass rate by test category; use the Suite filter to focus on one project.", + "Test Category Health History": "Health by test category; skipped tests count as healthy, failures lower the lane.", "Daily Run Volume (Selected Scope)": "Rolling daily counts of published quality-gate runs; volume explains confidence.", "Coverage History by Suite": "Coverage over time by suite; rising lines mean better test protection.", "Files <=500 LOC History by Suite": "LOC compliance over time; blue lanes mean files stay within the size limit.", @@ -2184,9 +2201,8 @@ def build_overview(): ], } overview_avg_coverage = f"(avg(({QUALITY_GATE_COVERAGE_BY_SUITE})) or on() vector(0))" - overview_loc_clean_suites = f"(sum(({QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE}) == bool 0) or on() vector(0))" - overview_category_pass_rate = ( - 'avg by (category) (platform_quality:test_case_pass_rate:percent_1h{' + overview_category_health = ( + 'avg by (category) (platform_quality:test_case_health_rate:percent_1h{' f'suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}",branch!="",branch=~"main|master|origin/main|origin/master",' f'test!="",test!="__no_test_cases__",category=~"{PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX}"' "})" @@ -2449,11 +2465,11 @@ def build_overview(): flux_source["options"]["text"] = {"titleSize": 10, "valueSize": 14} panels.append(flux_source) for panel_id, title, expr, y_pos, unit, decimals, thresholds, links in [ - (151, "CI Run Success (24h)", TEST_SUCCESS_RATE_24H, 9, "percent", 1, test_success_thresholds, "atlas-testing"), - (152, "Failed Runs (24h)", TEST_FAILURES_24H_TOTAL, 11, "none", 0, failure_count_thresholds, "atlas-testing"), - (153, "Suites With Runs (24h)", PLATFORM_TEST_ACTIVE_SUITES_24H, 13, "none", 0, perfect_count_thresholds, "atlas-testing"), - (154, "Avg Coverage", overview_avg_coverage, 15, "percent", 1, test_success_thresholds, "atlas-testing"), - (155, "LOC Clean Suites", overview_loc_clean_suites, 17, "none", 0, perfect_count_thresholds, "atlas-testing"), + (151, "Current Gate Health", PLATFORM_TEST_CURRENT_GATE_HEALTH, 9, "percent", 1, test_success_thresholds, "atlas-testing"), + (152, "CI Run Success (24h)", TEST_SUCCESS_RATE_24H, 11, "percent", 1, test_success_thresholds, "atlas-testing"), + (153, "Failed Runs (24h)", TEST_FAILURES_24H_TOTAL, 13, "none", 0, failure_count_thresholds, "atlas-testing"), + (154, "Suites With Runs (24h)", PLATFORM_TEST_ACTIVE_SUITES_24H, 15, "none", 0, perfect_count_thresholds, "atlas-testing"), + (155, "Avg Coverage", overview_avg_coverage, 17, "percent", 1, test_success_thresholds, "atlas-testing"), ]: rail_panel = stat_panel( panel_id, @@ -2522,8 +2538,8 @@ def build_overview(): panels.append( state_timeline_panel( 46, - "Test Category Pass Rate", - overview_category_pass_rate, + "Test Category Health", + overview_category_health, {"h": 6, "w": 6, "x": 15, "y": 13}, unit="percent", min_value=0, @@ -2531,7 +2547,7 @@ def build_overview(): legend="{{category}}", thresholds=test_success_thresholds, links=overview_link("atlas-testing"), - description="Pass rate by major test category across all suites over the last 24 hours. Blue is clean; warmer colors show categories needing attention.", + description="Health by major test category across all suites over the last 24 hours. Skipped tests are healthy; failures and errors lower the lane.", ) ) panels[-1]["options"]["legend"] = {"displayMode": "list", "placement": "bottom", "showLegend": False} @@ -3933,10 +3949,26 @@ def build_jobs_dashboard(): current_gate_seen_checks = ( f"sum by (suite) ({current_gate_seen_vector})" ) - current_gate_health_by_suite = ( + current_gate_check_health_by_suite = ( f"((100 * ({current_gate_ok_checks}) / clamp_min(({current_gate_seen_checks}), 1)) " f"or on(suite) ({selected_suite_missing}))" ) + current_category_health_by_suite = ( + f'min by (suite) (platform_quality:test_case_health_rate:percent_1h{{suite=~"{suite_var}",' + f'branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",' + f'category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})' + ) + current_gate_health_by_suite = ( + f"((min by (suite) (({current_gate_check_health_by_suite}) " + f"or ({current_category_health_by_suite}))) or on(suite) ({selected_suite_missing}))" + ) + current_gate_health = ( + f"(avg(clamp_min(({current_gate_health_by_suite}), 0)) or on() vector(0))" + ) + suite_freshness_24h = ( + f"100 * (sum(({runs_by_suite_24h}) > bool 0) or on() vector(0)) " + f"/ clamp_min(count(({selected_suite_universe})), 1)" + ) success_history_runs = f'sum by (suite) ({platform_runs_increase(runs_selector, "7d")})' success_history_by_suite = ( f'(100 * sum by (suite) ({platform_runs_increase(runs_success_selector, "7d")}) ' @@ -4094,7 +4126,7 @@ def build_jobs_dashboard(): f'branch!="",branch=~"{branch_var}",test!="",test=~"{test_var}",test!="__no_test_cases__"}})' ) category_pass_rate_history = ( - f'avg by (category) (platform_quality:test_case_pass_rate:percent_1h{{suite=~"{suite_var}",' + f'avg by (category) (platform_quality:test_case_health_rate:percent_1h{{suite=~"{suite_var}",' f'branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",' f'category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})' ) @@ -4195,6 +4227,18 @@ def build_jobs_dashboard(): ], } + panels.append( + stat_panel( + 156, + "Current Gate Health (%)", + current_gate_health, + {"h": 5, "w": 3, "x": 0, "y": 0}, + unit="percent", + decimals=2, + instant=True, + thresholds=success_thresholds, + ) + ) panels.append( stat_panel( 2, @@ -4244,6 +4288,18 @@ def build_jobs_dashboard(): }, ) ) + panels.append( + stat_panel( + 157, + "Suite Freshness (24h)", + suite_freshness_24h, + {"h": 5, "w": 3, "x": 15, "y": 0}, + unit="percent", + decimals=0, + instant=True, + thresholds=success_thresholds, + ) + ) panels.append( stat_panel( 6, @@ -4271,7 +4327,7 @@ def build_jobs_dashboard(): panels.append( bargauge_panel( 8, - "Latest Gate Checks Passing by Suite", + "Latest Gate Health by Suite", current_gate_health_by_suite, {"h": 8, "w": 8, "x": 0, "y": 5}, unit="percent", @@ -4286,8 +4342,8 @@ def build_jobs_dashboard(): {"type": "value", "options": {"-1": {"text": "missing"}}} ] panels[-1]["description"] = ( - "Latest pass percentage across required gate checks in the daily freshness window. " - "100% means tests and supporting gates recently passed; raw per-test history is tracked separately." + "Current health by suite from required gate checks, capped by category-level test health. " + "Skipped and not-applicable results are healthy; failures and errors lower the value." ) reliability_suite_panel = bargauge_panel( 9, @@ -4502,14 +4558,14 @@ def build_jobs_dashboard(): panels.append(selected_pass_rate_panel) category_pass_rate_panel = state_timeline_panel( 153, - "Test Category Pass Rate History", + "Test Category Health History", category_pass_rate_history, {"h": 8, "w": 12, "x": 12, "y": 21}, thresholds=success_thresholds, legend="{{category}}", description=( - "Pass rate by test category from current per-test metrics. Use the Suite filter to focus one " - "project; no data means that suite has not published category-aware results yet." + "Health by test category from current per-test metrics. Use the Suite filter to focus one " + "project; skipped tests are healthy, while failures and errors lower the lane." ), ) category_pass_rate_panel["links"] = jenkins_suite_links() @@ -4734,12 +4790,14 @@ def build_jobs_dashboard(): # and legend before the operator asks for them. panel_by_id = {panel["id"]: panel for panel in panels} visible_layout = { - 2: {"h": 4, "w": 4, "x": 0, "y": 0}, - 3: {"h": 4, "w": 4, "x": 4, "y": 0}, - 4: {"h": 4, "w": 4, "x": 8, "y": 0}, - 5: {"h": 4, "w": 4, "x": 12, "y": 0}, - 6: {"h": 4, "w": 4, "x": 16, "y": 0}, - 7: {"h": 4, "w": 4, "x": 20, "y": 0}, + 156: {"h": 4, "w": 3, "x": 0, "y": 0}, + 2: {"h": 4, "w": 3, "x": 3, "y": 0}, + 3: {"h": 4, "w": 3, "x": 6, "y": 0}, + 4: {"h": 4, "w": 3, "x": 9, "y": 0}, + 5: {"h": 4, "w": 3, "x": 12, "y": 0}, + 157: {"h": 4, "w": 3, "x": 15, "y": 0}, + 6: {"h": 4, "w": 3, "x": 18, "y": 0}, + 7: {"h": 4, "w": 3, "x": 21, "y": 0}, 8: {"h": 7, "w": 6, "x": 0, "y": 4}, 9: {"h": 7, "w": 6, "x": 6, "y": 4}, 17: {"h": 7, "w": 6, "x": 12, "y": 4}, diff --git a/scripts/tests/test_dashboards_render_atlas.py b/scripts/tests/test_dashboards_render_atlas.py index 97b13d21..0505283c 100644 --- a/scripts/tests/test_dashboards_render_atlas.py +++ b/scripts/tests/test_dashboards_render_atlas.py @@ -72,14 +72,14 @@ def test_overview_uses_readable_quality_power_and_gitops_panels(): ] assert "atlas-jobs" not in repr(dashboard) assert "Platform Test Success Rate" not in panels_by_title - assert panels_by_title["Test Category Pass Rate"]["type"] == "state-timeline" - assert panels_by_title["Test Category Pass Rate"]["gridPos"] == {"h": 6, "w": 6, "x": 15, "y": 13} - assert panels_by_title["Test Category Pass Rate"]["targets"][0]["legendFormat"] == "{{category}}" - assert "${overview_suite:regex}" not in panels_by_title["Test Category Pass Rate"]["targets"][0]["expr"] - assert mod.PLATFORM_TEST_SUITE_CANONICAL_MATCHER in panels_by_title["Test Category Pass Rate"]["targets"][0]["expr"] - assert "platform_quality:test_case_pass_rate:percent_1h" in panels_by_title["Test Category Pass Rate"]["targets"][0]["expr"] - assert panels_by_title["Test Category Pass Rate"]["timeFrom"] == "24h" - assert f'category=~"{mod.PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX}"' in panels_by_title["Test Category Pass Rate"]["targets"][0]["expr"] + assert panels_by_title["Test Category Health"]["type"] == "state-timeline" + assert panels_by_title["Test Category Health"]["gridPos"] == {"h": 6, "w": 6, "x": 15, "y": 13} + assert panels_by_title["Test Category Health"]["targets"][0]["legendFormat"] == "{{category}}" + assert "${overview_suite:regex}" not in panels_by_title["Test Category Health"]["targets"][0]["expr"] + assert mod.PLATFORM_TEST_SUITE_CANONICAL_MATCHER in panels_by_title["Test Category Health"]["targets"][0]["expr"] + assert "platform_quality:test_case_health_rate:percent_1h" in panels_by_title["Test Category Health"]["targets"][0]["expr"] + assert panels_by_title["Test Category Health"]["timeFrom"] == "24h" + assert f'category=~"{mod.PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX}"' in panels_by_title["Test Category Health"]["targets"][0]["expr"] assert "manual" not in mod.PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX assert "unit" not in mod.PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX assert panels_by_title["UPS History (Power Draw)"]["gridPos"] == {"h": 6, "w": 6, "x": 3, "y": 7} @@ -124,25 +124,28 @@ def test_overview_uses_readable_quality_power_and_gitops_panels(): assert panels_by_title["Flux Source"]["type"] == "stat" assert panels_by_title["Flux Source"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 7} assert panels_by_title["Flux Source"]["targets"][0]["legendFormat"] == "{{branch}}" - assert panels_by_title["CI Run Success (24h)"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 9} - assert panels_by_title["Suites With Runs (24h)"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 13} + assert panels_by_title["Current Gate Health"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 9} + assert "platform_quality:test_case_health_rate:percent_1h" in panels_by_title["Current Gate Health"]["targets"][0]["expr"] + assert panels_by_title["CI Run Success (24h)"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 11} + assert panels_by_title["Suites With Runs (24h)"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 15} suites_reporting_expr = panels_by_title["Suites With Runs (24h)"]["targets"][0]["expr"] assert "> bool 0" in suites_reporting_expr assert mod.PLATFORM_TEST_SUITE_CANONICAL_MATCHER in suites_reporting_expr assert "bstein-home" not in suites_reporting_expr assert "published quality-gate run" in panels_by_title["Suites With Runs (24h)"]["description"] - assert panels_by_title["LOC Clean Suites"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 17} + assert panels_by_title["Avg Coverage"]["gridPos"] == {"h": 2, "w": 3, "x": 21, "y": 17} + assert "LOC Clean Suites" not in panels_by_title assert panels_by_title["GitOps Health"]["type"] == "state-timeline" assert panels_by_title["GitOps Health"]["gridPos"] == {"h": 6, "w": 6, "x": 15, "y": 7} gitops_expr = panels_by_title["GitOps Health"]["targets"][0]["expr"] assert "Kustomizations Not Suspended" in gitops_expr assert "HelmReleases Not Suspended" in gitops_expr - assert panels_by_title["Test Category Pass Rate"]["type"] == "state-timeline" - assert panels_by_title["Test Category Pass Rate"]["options"]["legend"]["showLegend"] is False - assert panels_by_title["Test Category Pass Rate"]["options"]["mergeValues"] is False - assert panels_by_title["Test Category Pass Rate"]["options"]["showValue"] == "auto" - assert panels_by_title["Test Category Pass Rate"]["options"]["rowHeight"] == 0.9 - assert panels_by_title["Test Category Pass Rate"]["targets"][0]["legendFormat"] == "{{category}}" + assert panels_by_title["Test Category Health"]["type"] == "state-timeline" + assert panels_by_title["Test Category Health"]["options"]["legend"]["showLegend"] is False + assert panels_by_title["Test Category Health"]["options"]["mergeValues"] is False + assert panels_by_title["Test Category Health"]["options"]["showValue"] == "auto" + assert panels_by_title["Test Category Health"]["options"]["rowHeight"] == 0.9 + assert panels_by_title["Test Category Health"]["targets"][0]["legendFormat"] == "{{category}}" assert not any(variable["name"] == "overview_suite" for variable in dashboard["templating"]["list"]) pvc_backup_expr = panels_by_title["PVC Backup Health / Age"]["targets"][0]["expr"] @@ -214,7 +217,10 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability(): dashboard = mod.build_jobs_dashboard() panels_by_title = {panel["title"]: panel for panel in flatten_panels(dashboard["panels"])} - assert "Latest Gate Checks Passing by Suite" in panels_by_title + assert "Current Gate Health (%)" in panels_by_title + assert "Suite Freshness (24h)" in panels_by_title + assert "Latest Gate Health by Suite" in panels_by_title + assert "Latest Gate Checks Passing by Suite" not in panels_by_title assert "CI Run Success by Suite (24h)" in panels_by_title assert "CI Run Success by Suite (7d rolling)" in panels_by_title assert "Daily Run Volume (Selected Scope)" in panels_by_title @@ -227,9 +233,10 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability(): assert "Failures by Suite (24h)" not in panels_by_title assert "Success Rate by Suite (24h)" not in panels_by_title - current_gate_expr = panels_by_title["Latest Gate Checks Passing by Suite"]["targets"][0]["expr"] + current_gate_expr = panels_by_title["Latest Gate Health by Suite"]["targets"][0]["expr"] assert 'check)' in current_gate_expr assert "platform_quality:check_status:present_1h" in current_gate_expr + assert "platform_quality:test_case_health_rate:percent_1h" in current_gate_expr assert '.*_quality_gate_checks_total' not in current_gate_expr assert "last_over_time" not in current_gate_expr assert 'label_replace' not in current_gate_expr @@ -237,7 +244,10 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability(): assert 'status!~"ok|passed|success|not_applicable|skipped|na|n/a"' in current_gate_expr assert "unless on(suite, check)" in current_gate_expr assert "tlast_over_time" not in current_gate_expr - assert panels_by_title["Latest Gate Checks Passing by Suite"]["gridPos"]["w"] == 6 + assert panels_by_title["Current Gate Health (%)"]["gridPos"] == {"h": 4, "w": 3, "x": 0, "y": 0} + assert "platform_quality:test_case_health_rate:percent_1h" in panels_by_title["Current Gate Health (%)"]["targets"][0]["expr"] + assert panels_by_title["Suite Freshness (24h)"]["gridPos"] == {"h": 4, "w": 3, "x": 15, "y": 0} + assert panels_by_title["Latest Gate Health by Suite"]["gridPos"]["w"] == 6 assert panels_by_title["CI Run Success by Suite (24h)"]["gridPos"]["w"] == 6 assert panels_by_title["Coverage by Suite (Latest, gate 95)"]["gridPos"] == {"h": 7, "w": 6, "x": 12, "y": 4} assert panels_by_title["Files <=500 LOC by Suite (Latest)"]["gridPos"] == {"h": 7, "w": 6, "x": 18, "y": 4} @@ -255,13 +265,14 @@ def test_jobs_dashboard_separates_current_gate_health_from_reliability(): rolling_panel = panels_by_title["CI Run Success by Suite (7d rolling)"] assert rolling_panel["type"] == "state-timeline" assert "[7d:1m]" in rolling_panel["targets"][0]["expr"] - category_panel = panels_by_title["Test Category Pass Rate History"] + category_panel = panels_by_title["Test Category Health History"] assert category_panel["type"] == "state-timeline" assert "category" in category_panel["targets"][0]["expr"] + assert "platform_quality:test_case_health_rate:percent_1h" in category_panel["targets"][0]["expr"] assert f'category=~"{mod.PLATFORM_TEST_CATEGORY_REGEX}"' in category_panel["targets"][0]["expr"] assert "installer" not in mod.PLATFORM_TEST_CATEGORY_REGEX assert "Use the Suite filter" in category_panel["description"] - assert "category-aware results" in category_panel["description"] + assert "skipped tests are healthy" in category_panel["description"] coverage_panel = panels_by_title["Coverage History by Suite"] loc_panel = panels_by_title["Files <=500 LOC History by Suite"] @@ -311,9 +322,9 @@ def test_jobs_dashboard_collapses_heavy_drilldowns_for_light_first_paint(): for child in row.get("panels", []) } - assert len(panels) == 16 - assert len(visible_query_panels) == 10 - assert sum(len(panel.get("targets", [])) for panel in visible_query_panels) == 10 + assert len(panels) == 18 + assert len(visible_query_panels) == 12 + assert sum(len(panel.get("targets", [])) for panel in visible_query_panels) == 12 assert all( panel["title"] != "Coverage Gap to 95% by Suite" for panel in visible_query_panels @@ -330,7 +341,7 @@ def test_jobs_dashboard_collapses_heavy_drilldowns_for_light_first_paint(): assert "Coverage Failure Rate" in nested_panels_by_title assert "Supply Chain Healthy Rate" in nested_panels_by_title - assert "Test Category Pass Rate History" in nested_panels_by_title + assert "Test Category Health History" in nested_panels_by_title assert "Selected Test Pass Rate History" in nested_panels_by_title assert "Coverage Metrics Present by Suite" in nested_panels_by_title assert "SonarQube API Up" in nested_panels_by_title diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 217803be..24487d39 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -2114,7 +2114,7 @@ { "id": 151, "type": "stat", - "title": "CI Run Success (24h)", + "title": "Current Gate Health", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2125,6 +2125,92 @@ "x": 21, "y": 9 }, + "targets": [ + { + "expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "dark-orange", + "value": 70 + }, + { + "color": "dark-yellow", + "value": 85 + }, + { + "color": "dark-green", + "value": 95 + }, + { + "color": "dark-blue", + "value": 100 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value", + "text": { + "titleSize": 10, + "valueSize": 19 + } + }, + "links": [ + { + "title": "Open atlas-testing dashboard", + "url": "/d/atlas-testing", + "targetBlank": true + } + ], + "description": "Current gate-check health across suites; skipped or not-applicable checks count as healthy, failures lower it." + }, + { + "id": 152, + "type": "stat", + "title": "CI Run Success (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 21, + "y": 11 + }, "targets": [ { "expr": "100 * ((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|typhon|bstein_home|bstein-home|data_prepper|data-prepper\",status=~\"ok|passed|success\",exported_job=\"platform-quality-ci\"}))[24h:1m])) or on() vector(0))) / clamp_min(((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|typhon|bstein_home|bstein-home|data_prepper|data-prepper\",exported_job=\"platform-quality-ci\"}))[24h:1m])) or on() vector(0))), 1)", @@ -2198,7 +2284,7 @@ "description": "Percent of published quality-gate CI runs that completed successfully in 24h; this is automation health, not raw test pass rate." }, { - "id": 152, + "id": 153, "type": "stat", "title": "Failed Runs (24h)", "datasource": { @@ -2209,7 +2295,7 @@ "h": 2, "w": 3, "x": 21, - "y": 11 + "y": 13 }, "targets": [ { @@ -2280,7 +2366,7 @@ "description": "Published quality-gate runs that failed in 24h; zero is good, any value needs a look." }, { - "id": 153, + "id": 154, "type": "stat", "title": "Suites With Runs (24h)", "datasource": { @@ -2291,7 +2377,7 @@ "h": 2, "w": 3, "x": 21, - "y": 13 + "y": 15 }, "targets": [ { @@ -2362,7 +2448,7 @@ "description": "Configured suites with at least one published quality-gate run in 24h; full count means the dashboard is fresh." }, { - "id": 154, + "id": 155, "type": "stat", "title": "Avg Coverage", "datasource": { @@ -2373,7 +2459,7 @@ "h": 2, "w": 3, "x": 21, - "y": 15 + "y": 17 }, "targets": [ { @@ -2447,88 +2533,6 @@ ], "description": "Average latest line coverage across suites; higher means code is better protected by tests." }, - { - "id": 155, - "type": "stat", - "title": "LOC Clean Suites", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 2, - "w": 3, - "x": 21, - "y": 17 - }, - "targets": [ - { - "expr": "(sum((max by (suite) ((last_over_time(platform_quality_gate_source_lines_over_500_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\"}[30d])) and (topk by (suite) (1, tlast_over_time(platform_quality_gate_source_lines_over_500_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\"}[30d]))))) == bool 0) or on() vector(0))", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "dark-red", - "value": null - }, - { - "color": "dark-yellow", - "value": 9 - }, - { - "color": "dark-green", - "value": 10 - }, - { - "color": "dark-blue", - "value": 11 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - }, - "decimals": 0 - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value", - "text": { - "titleSize": 10, - "valueSize": 19 - } - }, - "links": [ - { - "title": "Open atlas-testing dashboard", - "url": "/d/atlas-testing", - "targetBlank": true - } - ], - "description": "Suites with no source files over 500 LOC; full count is good for maintainability." - }, { "id": 150, "type": "state-timeline", @@ -2795,8 +2799,8 @@ { "id": 46, "type": "state-timeline", - "title": "Test Category Pass Rate", - "description": "Pass rate by major test category across all suites over the last 24 hours. Blue is clean; warmer colors show categories needing attention.", + "title": "Test Category Health", + "description": "Health by major test category across all suites over the last 24 hours. Skipped tests are healthy; failures and errors lower the lane.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2809,7 +2813,7 @@ }, "targets": [ { - "expr": "avg by (category) (platform_quality:test_case_pass_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})", + "expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})", "refId": "A", "legendFormat": "{{category}}" } diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json index 63fde54c..8cca49fe 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-testing.json @@ -4,6 +4,81 @@ "folderUid": "atlas-public", "editable": false, "panels": [ + { + "id": 156, + "type": "stat", + "title": "Current Gate Health (%)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "(avg(clamp_min((((min by (suite) ((((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1)) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0) or (count by (suite) (max_over_time(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0)))) - 1)))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0) or (count by (suite) (max_over_time(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0)))) - 1)))), 0)) or on() vector(0))", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "dark-orange", + "value": 90 + }, + { + "color": "dark-yellow", + "value": 93 + }, + { + "color": "dark-green", + "value": 95 + }, + { + "color": "dark-blue", + "value": 100 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "description": "Average latest required gate checks passing across selected suites; this is the current quality state." + }, { "id": 2, "type": "stat", @@ -14,8 +89,8 @@ }, "gridPos": { "h": 4, - "w": 4, - "x": 0, + "w": 3, + "x": 3, "y": 0 }, "targets": [ @@ -89,8 +164,8 @@ }, "gridPos": { "h": 4, - "w": 4, - "x": 4, + "w": 3, + "x": 6, "y": 0 }, "targets": [ @@ -164,8 +239,8 @@ }, "gridPos": { "h": 4, - "w": 4, - "x": 8, + "w": 3, + "x": 9, "y": 0 }, "targets": [ @@ -238,7 +313,7 @@ }, "gridPos": { "h": 4, - "w": 4, + "w": 3, "x": 12, "y": 0 }, @@ -290,6 +365,81 @@ }, "description": "Selected quality-gate CI run count in 24h; zero means the dashboard may be stale." }, + { + "id": 157, + "type": "stat", + "title": "Suite Freshness (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 15, + "y": 0 + }, + "targets": [ + { + "expr": "100 * (sum((sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[24h:1m]))) > bool 0) or on() vector(0)) / clamp_min(count(((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0) or (count by (suite) (max_over_time(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))), 1)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "dark-orange", + "value": 90 + }, + { + "color": "dark-yellow", + "value": 93 + }, + { + "color": "dark-green", + "value": 95 + }, + { + "color": "dark-blue", + "value": 100 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "description": "Percent of selected suites with at least one quality-gate CI run in 24h; 100% means inputs are fresh." + }, { "id": 6, "type": "stat", @@ -300,8 +450,8 @@ }, "gridPos": { "h": 4, - "w": 4, - "x": 16, + "w": 3, + "x": 18, "y": 0 }, "targets": [ @@ -375,8 +525,8 @@ }, "gridPos": { "h": 4, - "w": 4, - "x": 20, + "w": 3, + "x": 21, "y": 0 }, "targets": [ @@ -442,7 +592,7 @@ { "id": 8, "type": "bargauge", - "title": "Latest Gate Checks Passing by Suite", + "title": "Latest Gate Health by Suite", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -455,7 +605,7 @@ }, "targets": [ { - "expr": "sort(((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1)) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0) or (count by (suite) (max_over_time(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0)))) - 1))))", + "expr": "sort(((min by (suite) ((((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1)) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0) or (count by (suite) (max_over_time(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0)))) - 1)))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0) or (count by (suite) (max_over_time(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0)))) - 1))))", "refId": "A", "legendFormat": "{{suite}}", "instant": true @@ -530,7 +680,7 @@ } } ], - "description": "Latest pass percentage across required gate checks in the daily freshness window. 100% means tests and supporting gates recently passed; raw per-test history is tracked separately." + "description": "Current health by suite from required gate checks, capped by category-level test health. Skipped and not-applicable results are healthy; failures and errors lower the value." }, { "id": 9, @@ -902,8 +1052,8 @@ { "id": 153, "type": "state-timeline", - "title": "Test Category Pass Rate History", - "description": "Pass rate by test category from current per-test metrics. Use the Suite filter to focus one project; no data means that suite has not published category-aware results yet.", + "title": "Test Category Health History", + "description": "Health by test category from current per-test metrics. Use the Suite filter to focus one project; skipped tests are healthy, while failures and errors lower the lane.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -916,7 +1066,7 @@ }, "targets": [ { - "expr": "avg by (category) (platform_quality:test_case_pass_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})", + "expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})", "refId": "A", "legendFormat": "{{category}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index f568ec3b..ef55f91a 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -2123,7 +2123,7 @@ data: { "id": 151, "type": "stat", - "title": "CI Run Success (24h)", + "title": "Current Gate Health", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2134,6 +2134,92 @@ data: "x": 21, "y": 9 }, + "targets": [ + { + "expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "dark-orange", + "value": 70 + }, + { + "color": "dark-yellow", + "value": 85 + }, + { + "color": "dark-green", + "value": 95 + }, + { + "color": "dark-blue", + "value": 100 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value", + "text": { + "titleSize": 10, + "valueSize": 19 + } + }, + "links": [ + { + "title": "Open atlas-testing dashboard", + "url": "/d/atlas-testing", + "targetBlank": true + } + ], + "description": "Current gate-check health across suites; skipped or not-applicable checks count as healthy, failures lower it." + }, + { + "id": 152, + "type": "stat", + "title": "CI Run Success (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 21, + "y": 11 + }, "targets": [ { "expr": "100 * ((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|typhon|bstein_home|bstein-home|data_prepper|data-prepper\",status=~\"ok|passed|success\",exported_job=\"platform-quality-ci\"}))[24h:1m])) or on() vector(0))) / clamp_min(((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|typhon|bstein_home|bstein-home|data_prepper|data-prepper\",exported_job=\"platform-quality-ci\"}))[24h:1m])) or on() vector(0))), 1)", @@ -2207,7 +2293,7 @@ data: "description": "Percent of published quality-gate CI runs that completed successfully in 24h; this is automation health, not raw test pass rate." }, { - "id": 152, + "id": 153, "type": "stat", "title": "Failed Runs (24h)", "datasource": { @@ -2218,7 +2304,7 @@ data: "h": 2, "w": 3, "x": 21, - "y": 11 + "y": 13 }, "targets": [ { @@ -2289,7 +2375,7 @@ data: "description": "Published quality-gate runs that failed in 24h; zero is good, any value needs a look." }, { - "id": 153, + "id": 154, "type": "stat", "title": "Suites With Runs (24h)", "datasource": { @@ -2300,7 +2386,7 @@ data: "h": 2, "w": 3, "x": 21, - "y": 13 + "y": 15 }, "targets": [ { @@ -2371,7 +2457,7 @@ data: "description": "Configured suites with at least one published quality-gate run in 24h; full count means the dashboard is fresh." }, { - "id": 154, + "id": 155, "type": "stat", "title": "Avg Coverage", "datasource": { @@ -2382,7 +2468,7 @@ data: "h": 2, "w": 3, "x": 21, - "y": 15 + "y": 17 }, "targets": [ { @@ -2456,88 +2542,6 @@ data: ], "description": "Average latest line coverage across suites; higher means code is better protected by tests." }, - { - "id": 155, - "type": "stat", - "title": "LOC Clean Suites", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 2, - "w": 3, - "x": 21, - "y": 17 - }, - "targets": [ - { - "expr": "(sum((max by (suite) ((last_over_time(platform_quality_gate_source_lines_over_500_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\"}[30d])) and (topk by (suite) (1, tlast_over_time(platform_quality_gate_source_lines_over_500_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",exported_job=\"platform-quality-ci\"}[30d]))))) == bool 0) or on() vector(0))", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "dark-red", - "value": null - }, - { - "color": "dark-yellow", - "value": 9 - }, - { - "color": "dark-green", - "value": 10 - }, - { - "color": "dark-blue", - "value": 11 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - }, - "decimals": 0 - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value", - "text": { - "titleSize": 10, - "valueSize": 19 - } - }, - "links": [ - { - "title": "Open atlas-testing dashboard", - "url": "/d/atlas-testing", - "targetBlank": true - } - ], - "description": "Suites with no source files over 500 LOC; full count is good for maintainability." - }, { "id": 150, "type": "state-timeline", @@ -2804,8 +2808,8 @@ data: { "id": 46, "type": "state-timeline", - "title": "Test Category Pass Rate", - "description": "Pass rate by major test category across all suites over the last 24 hours. Blue is clean; warmer colors show categories needing attention.", + "title": "Test Category Health", + "description": "Health by major test category across all suites over the last 24 hours. Skipped tests are healthy; failures and errors lower the lane.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2818,7 +2822,7 @@ data: }, "targets": [ { - "expr": "avg by (category) (platform_quality:test_case_pass_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})", + "expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan_iac|typhon|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})", "refId": "A", "legendFormat": "{{category}}" } diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml index 2684b39b..6697c056 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -13,6 +13,81 @@ data: "folderUid": "atlas-public", "editable": false, "panels": [ + { + "id": 156, + "type": "stat", + "title": "Current Gate Health (%)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "(avg(clamp_min((((min by (suite) ((((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1)) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0) or (count by (suite) (max_over_time(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0)))) - 1)))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0) or (count by (suite) (max_over_time(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0)))) - 1)))), 0)) or on() vector(0))", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "dark-orange", + "value": 90 + }, + { + "color": "dark-yellow", + "value": 93 + }, + { + "color": "dark-green", + "value": 95 + }, + { + "color": "dark-blue", + "value": 100 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "description": "Average latest required gate checks passing across selected suites; this is the current quality state." + }, { "id": 2, "type": "stat", @@ -23,8 +98,8 @@ data: }, "gridPos": { "h": 4, - "w": 4, - "x": 0, + "w": 3, + "x": 3, "y": 0 }, "targets": [ @@ -98,8 +173,8 @@ data: }, "gridPos": { "h": 4, - "w": 4, - "x": 4, + "w": 3, + "x": 6, "y": 0 }, "targets": [ @@ -173,8 +248,8 @@ data: }, "gridPos": { "h": 4, - "w": 4, - "x": 8, + "w": 3, + "x": 9, "y": 0 }, "targets": [ @@ -247,7 +322,7 @@ data: }, "gridPos": { "h": 4, - "w": 4, + "w": 3, "x": 12, "y": 0 }, @@ -299,6 +374,81 @@ data: }, "description": "Selected quality-gate CI run count in 24h; zero means the dashboard may be stale." }, + { + "id": 157, + "type": "stat", + "title": "Suite Freshness (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 15, + "y": 0 + }, + "targets": [ + { + "expr": "100 * (sum((sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[24h:1m]))) > bool 0) or on() vector(0)) / clamp_min(count(((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0) or (count by (suite) (max_over_time(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0))), 1)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-red", + "value": null + }, + { + "color": "dark-orange", + "value": 90 + }, + { + "color": "dark-yellow", + "value": 93 + }, + { + "color": "dark-green", + "value": 95 + }, + { + "color": "dark-blue", + "value": 100 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "description": "Percent of selected suites with at least one quality-gate CI run in 24h; 100% means inputs are fresh." + }, { "id": 6, "type": "stat", @@ -309,8 +459,8 @@ data: }, "gridPos": { "h": 4, - "w": 4, - "x": 16, + "w": 3, + "x": 18, "y": 0 }, "targets": [ @@ -384,8 +534,8 @@ data: }, "gridPos": { "h": 4, - "w": 4, - "x": 20, + "w": 3, + "x": 21, "y": 0 }, "targets": [ @@ -451,7 +601,7 @@ data: { "id": 8, "type": "bargauge", - "title": "Latest Gate Checks Passing by Suite", + "title": "Latest Gate Health by Suite", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -464,7 +614,7 @@ data: }, "targets": [ { - "expr": "sort(((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1)) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0) or (count by (suite) (max_over_time(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0)))) - 1))))", + "expr": "sort(((min by (suite) ((((100 * (sum by (suite) (((clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1)) unless on(suite, check) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) > 0), 1))))) / clamp_min((sum by (suite) (clamp_max(max by (suite, check) ((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain\",status!=\"\"})) > 0), 1))), 1)) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0) or (count by (suite) (max_over_time(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0)))) - 1)))) or (min by (suite) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})))) or on(suite) ((((0 * ((count by (suite) (platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}) >= bool 0) or (count by (suite) (max_over_time(platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}[30d])) >= bool 0)))) - 1))))", "refId": "A", "legendFormat": "{{suite}}", "instant": true @@ -539,7 +689,7 @@ data: } } ], - "description": "Latest pass percentage across required gate checks in the daily freshness window. 100% means tests and supporting gates recently passed; raw per-test history is tracked separately." + "description": "Current health by suite from required gate checks, capped by category-level test health. Skipped and not-applicable results are healthy; failures and errors lower the value." }, { "id": 9, @@ -911,8 +1061,8 @@ data: { "id": 153, "type": "state-timeline", - "title": "Test Category Pass Rate History", - "description": "Pass rate by test category from current per-test metrics. Use the Suite filter to focus one project; no data means that suite has not published category-aware results yet.", + "title": "Test Category Health History", + "description": "Health by test category from current per-test metrics. Use the Suite filter to focus one project; skipped tests are healthy, while failures and errors lower the lane.", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -925,7 +1075,7 @@ data: }, "targets": [ { - "expr": "avg by (category) (platform_quality:test_case_pass_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})", + "expr": "avg by (category) (platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"})", "refId": "A", "legendFormat": "{{category}}" } diff --git a/services/monitoring/vmalert-atlas-availability.yaml b/services/monitoring/vmalert-atlas-availability.yaml index add627ab..f9fd7bb6 100644 --- a/services/monitoring/vmalert-atlas-availability.yaml +++ b/services/monitoring/vmalert-atlas-availability.yaml @@ -191,6 +191,42 @@ data: ) labels: rollup: hourly + - record: platform_quality:test_case_health_rate:percent_1h + expr: | + 100 * ( + ( + sum by (suite, branch, test, category) ( + platform_quality_gate_test_case_result{exported_job="platform-quality-ci",branch!="",test!="",test!="__no_test_cases__",category=~"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit",status=~"passed|skipped|not_applicable|na|n/a"} + or label_replace( + platform_quality_gate_test_case_result{exported_job="platform-quality-ci",branch!="",test!="",test!="__no_test_cases__",category="",status=~"passed|skipped|not_applicable|na|n/a"}, + "category", "uncategorized", "__name__", ".*" + ) + ) + or on(suite, branch, test, category) + ( + 0 * sum by (suite, branch, test, category) ( + platform_quality_gate_test_case_result{exported_job="platform-quality-ci",branch!="",test!="",test!="__no_test_cases__",category=~"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit",status=~"passed|failed|error|skipped|not_applicable|na|n/a"} + or label_replace( + platform_quality_gate_test_case_result{exported_job="platform-quality-ci",branch!="",test!="",test!="__no_test_cases__",category="",status=~"passed|failed|error|skipped|not_applicable|na|n/a"}, + "category", "uncategorized", "__name__", ".*" + ) + ) + ) + ) + ) + / + clamp_min( + sum by (suite, branch, test, category) ( + platform_quality_gate_test_case_result{exported_job="platform-quality-ci",branch!="",test!="",test!="__no_test_cases__",category=~"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit",status=~"passed|failed|error|skipped|not_applicable|na|n/a"} + or label_replace( + platform_quality_gate_test_case_result{exported_job="platform-quality-ci",branch!="",test!="",test!="__no_test_cases__",category="",status=~"passed|failed|error|skipped|not_applicable|na|n/a"}, + "category", "uncategorized", "__name__", ".*" + ) + ), + 1 + ) + labels: + rollup: hourly - record: platform_quality:check_status:present_1h expr: | sum by (suite, branch, check, status) (