From 75d002dc885cdb9e7e1b40a1bbf1c7240174b9cf Mon Sep 17 00:00:00 2001 From: jenkins Date: Fri, 5 Jun 2026 13:14:51 -0300 Subject: [PATCH] monitoring(testing): cap expensive dashboard queries --- scripts/dashboards_render_atlas.py | 84 +++++++++++-------- .../monitoring/dashboards/atlas-testing.json | 34 ++++---- .../monitoring/grafana-dashboard-testing.yaml | 34 ++++---- 3 files changed, 88 insertions(+), 64 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 52a7eb93..04fbb1df 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -676,17 +676,23 @@ PLATFORM_TEST_SOURCE_LINES_OVER_500_ROLLUP = ( ) PLATFORM_TEST_SONAR_HEALTH_ROLLUP = "platform_quality:sonar_gate_health_percent:latest_1h" PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP = "platform_quality:test_category_health_rate:percent_1h" +PLATFORM_TEST_HISTORY_WINDOW = "7d" +PLATFORM_TEST_HISTORY_STEP = "1h" +PLATFORM_TEST_BRANCH_EVIDENCE_WINDOW = "7d" +PLATFORM_TEST_BRANCH_EVIDENCE_STEP = "1h" +PLATFORM_TEST_CASE_DISCOVERY_WINDOW = "24h" +PLATFORM_TEST_CASE_PANEL_WINDOW = "24h" PLATFORM_TEST_SUCCESS_EVENTS_30D = ( - f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",status=~\"{PLATFORM_TEST_SUCCESS_STATUS}\",{PLATFORM_TEST_EXPORT_FILTER}", "30d", "15m")}) or on() vector(0))' + f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",status=~\"{PLATFORM_TEST_SUCCESS_STATUS}\",{PLATFORM_TEST_EXPORT_FILTER}", PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)}) or on() vector(0))' ) PLATFORM_TEST_TOTAL_EVENTS_30D = ( - f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", "30d", "15m")}) or on() vector(0))' + f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)}) or on() vector(0))' ) PLATFORM_TEST_SUCCESS_EVENTS_7D = ( - f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",status=~\"{PLATFORM_TEST_SUCCESS_STATUS}\",{PLATFORM_TEST_EXPORT_FILTER}", "7d")}) or on() vector(0))' + f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",status=~\"{PLATFORM_TEST_SUCCESS_STATUS}\",{PLATFORM_TEST_EXPORT_FILTER}", PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)}) or on() vector(0))' ) PLATFORM_TEST_TOTAL_EVENTS_7D = ( - f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", "7d")}) or on() vector(0))' + f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)}) or on() vector(0))' ) PLATFORM_TEST_SUCCESS_EVENTS_24H = ( f'(sum({PLATFORM_TEST_RUNS_24H_ROLLUP}{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}"}}) or on() vector(0))' @@ -710,7 +716,7 @@ PLATFORM_TEST_FAILURES_24H_BY_SUITE = ( f'sort_desc(sum by (suite) ({PLATFORM_TEST_RUNS_24H_ROLLUP}{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status!~"{PLATFORM_TEST_SUCCESS_STATUS}"}}))' ) PLATFORM_TEST_ACTIVITY_30D = ( - f'sum by (suite, status) ({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", "30d", "15m")})' + f'sum by (suite, status) ({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)})' ) PLATFORM_TEST_RUNS_24H_TOTAL = PLATFORM_TEST_TOTAL_EVENTS_24H PLATFORM_TEST_ACTIVE_SUITES_24H = ( @@ -719,7 +725,7 @@ PLATFORM_TEST_ACTIVE_SUITES_24H = ( ) PLATFORM_TEST_POINT_WINDOW = "1h" PLATFORM_TEST_FRESH_WINDOW = "30h" -PLATFORM_TEST_LATEST_WINDOW = "30d" +PLATFORM_TEST_LATEST_WINDOW = "7d" def platform_check_status_expr( @@ -832,7 +838,7 @@ PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = ( f'/ clamp_min((sum by (suite) ({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", "24h")})), 1))' ) QUALITY_GATE_SUITE_INDEX_30D = ( - f'sum by (suite) ({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", "30d", "15m")})' + f'sum by (suite) ({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)})' ) QUALITY_GATE_COVERAGE_BY_SUITE = ( f'max by (suite) ({PLATFORM_TEST_COVERAGE_ROLLUP}{{suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}"}})' @@ -1569,11 +1575,11 @@ def testing_case_variable(): "label": "Test Case", "type": "query", "query": ( - "query_result(topk(250, count by (test) (max_over_time(" + "query_result(topk(75, count by (test) (max_over_time(" f'platform_quality:test_case_health_rate:percent_1h{{suite=~"${{suite:regex}}",branch!="",' f'branch=~"${{branch:regex}}",test!="",test!="__no_test_cases__",' f'category!~"{PLATFORM_TEST_SUPPORT_CATEGORY_REGEX}"}}' - "[$__range]))))" + f"[{PLATFORM_TEST_CASE_DISCOVERY_WINDOW}:{PLATFORM_TEST_HISTORY_STEP}]))))" ), "regex": '/test="([^"]+)"/', "current": {"text": "All", "value": "$__all", "selected": True}, @@ -1914,7 +1920,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = { TESTING_PANEL_DESCRIPTIONS = { "Current Gate Health (%)": "Average latest required gate checks passing across selected suites; this is the current quality state.", "CI Run Success Rate (24h)": "Percent of selected quality-gate CI runs that completed successfully in 24h; this is run health, not individual test pass rate.", - "CI Run Success Rate (30d)": "Percent of selected quality-gate CI runs that completed successfully in 30d; higher means more stable automation.", + "CI Run Success Rate (7d)": "Percent of selected quality-gate CI runs that completed successfully in 7d; higher means more stable automation.", "Failed Runs (24h)": "Selected quality-gate runs that failed in 24h; zero is good and anything else needs a look.", "CI Runs (24h)": "Selected quality-gate CI run count in 24h; zero means the dashboard may be stale.", "Suite Freshness (24h)": "Percent of selected suites with at least one quality-gate CI run in 24h; 100% means inputs are fresh.", @@ -1948,7 +1954,7 @@ TESTING_PANEL_DESCRIPTIONS = { "Supply Chain Healthy Rate": "Percent of supply-chain checks passing or not applicable; higher is better.", "Test Drilldowns And Problem Tests": "Test-case detail for finding which tests are hurting reliability.", "Problematic Tests Over Time (Top failures)": "Current outlier tests by rolling 24h failures; tests need repeat failures to stay visible.", - "Most Problematic Test by Suite (30d)": "Worst test per suite summed over 30d; high counts can be historical debt.", + "Most Problematic Test by Suite (7d)": "Worst test per suite summed over 7d; high counts can be historical debt.", "Selected Test Pass/Fail History": "Hourly pass/fail/skipped volume for the selected test filter.", "Selected Test Pass Rate History": "Pass rate history for the selected test filter; higher means the test is stable.", "Telemetry Completeness And Branches": "Checks that each suite publishes the data this dashboard needs.", @@ -1958,8 +1964,8 @@ TESTING_PANEL_DESCRIPTIONS = { "LOC Compliance Metrics Present by Suite": "Whether LOC metrics are present; 100% means size panels are reliable.", "Test-Case Metrics Present by Suite": "Whether per-test metrics are present; 100% enables drilldowns.", "Real Test Cases Present by Suite": "Whether real test names are present; 100% means not just placeholder telemetry.", - "Recent Branch Evidence by Suite (30d)": "Branches with recent CI evidence; unexpected branches can mean drift or stale work.", - "Primary Branch Clean by Suite (30d)": "Percent clean of non-primary branch evidence; 100% means only main/master is reporting.", + "Recent Branch Evidence by Suite (7d)": "Branches with recent CI evidence; unexpected branches can mean drift or stale work.", + "Primary Branch Clean by Suite (7d)": "Percent clean of non-primary branch evidence; 100% means only main/master is reporting.", "SonarQube Project Health": "SonarQube availability, projects, fetch errors, and gate status.", "SonarQube API Up": "Whether the SonarQube exporter can reach SonarQube; 1 is good.", "Sonar Projects (Selected)": "Selected SonarQube project count; zero means Sonar is not tracking that suite.", @@ -4044,14 +4050,18 @@ def build_jobs_dashboard(): f'branch=~"{branch_var}",status!~"{success}"}}' ) runs_24h = f'(sum({runs_24h_rollup_selector}) or on() vector(0))' - runs_30d = f'(sum({platform_runs_increase(runs_selector, "30d", "15m")}) or on() vector(0))' + runs_history = ( + f'(sum({platform_runs_increase(runs_selector, PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)}) ' + "or on() vector(0))" + ) success_24h = f'(sum({runs_24h_success_rollup_selector}) or on() vector(0))' - success_30d = ( - f'(sum({platform_runs_increase(runs_success_selector, "30d", "15m")}) or on() vector(0))' + success_history_total = ( + f'(sum({platform_runs_increase(runs_success_selector, PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)}) ' + "or on() vector(0))" ) failures_24h = f'(sum({runs_24h_failure_rollup_selector}) or on() vector(0))' success_rate_24h = f"100 * ({success_24h}) / clamp_min(({runs_24h}), 1)" - success_rate_30d = f"100 * ({success_30d}) / clamp_min(({runs_30d}), 1)" + success_rate_history = f"100 * ({success_history_total}) / clamp_min(({runs_history}), 1)" runs_by_suite_24h = f"sum by (suite) ({runs_24h_rollup_selector})" success_by_suite_24h = f"sum by (suite) ({runs_24h_success_rollup_selector})" success_rate_by_suite_24h = ( @@ -4089,9 +4099,11 @@ def build_jobs_dashboard(): f"100 * (sum(({runs_by_suite_24h}) > bool 0) or on() vector(0)) " f"/ clamp_min(count(({selected_suite_universe})), 1)" ) - success_history_runs = f'sum by (suite) ({platform_runs_increase(runs_selector, "7d")})' + success_history_runs = ( + f"sum by (suite) ({platform_runs_increase(runs_selector, PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)})" + ) success_history_by_suite = ( - f'(100 * sum by (suite) ({platform_runs_increase(runs_success_selector, "7d")}) ' + f"(100 * sum by (suite) ({platform_runs_increase(runs_success_selector, PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)}) " f'/ ({success_history_runs})) and on(suite) (({success_history_runs}) > 0)' ) daily_success_volume = ( @@ -4178,11 +4190,11 @@ def build_jobs_dashboard(): f"and on (suite, test) topk(12, ({current_problem_test_candidates}) >= 2)" ) problematic_tests_history = problematic_tests_history_core - rollup_failed_tests_30d = ( - f'sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}}[30d:1h]))' + rollup_failed_tests_history = ( + f'sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}}[{PLATFORM_TEST_HISTORY_WINDOW}:{PLATFORM_TEST_HISTORY_STEP}]))' ) worst_test_per_suite_core = ( - f"topk by (suite) (1, ({rollup_failed_tests_30d}))" + f"topk by (suite) (1, ({rollup_failed_tests_history}))" ) worst_test_per_suite = worst_test_per_suite_core @@ -4219,13 +4231,13 @@ def build_jobs_dashboard(): f'branch!="",branch=~"{branch_var}",category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})' ) recent_branch_evidence = ( - f'sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[30d:15m])))' + f'sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[{PLATFORM_TEST_BRANCH_EVIDENCE_WINDOW}:{PLATFORM_TEST_BRANCH_EVIDENCE_STEP}])))' ) non_primary_branch_evidence = ( - f'count by (suite) (max_over_time(platform_quality_gate_build_info{{{build_info_selector},branch!~"main|master|origin/main|origin/master|unknown"}}[30d:15m]))' + f'count by (suite) (max_over_time(platform_quality_gate_build_info{{{build_info_selector},branch!~"main|master|origin/main|origin/master|unknown"}}[{PLATFORM_TEST_BRANCH_EVIDENCE_WINDOW}:{PLATFORM_TEST_BRANCH_EVIDENCE_STEP}]))' ) branch_evidence_by_suite = ( - f'count by (suite) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[30d:15m]))' + f'count by (suite) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[{PLATFORM_TEST_BRANCH_EVIDENCE_WINDOW}:{PLATFORM_TEST_BRANCH_EVIDENCE_STEP}]))' ) primary_branch_clean_by_suite = ( f'(100 * ((({branch_evidence_by_suite}) > bool 0) ' @@ -4342,8 +4354,8 @@ def build_jobs_dashboard(): panels.append( stat_panel( 3, - "CI Run Success Rate (30d)", - success_rate_30d, + "CI Run Success Rate (7d)", + success_rate_history, {"h": 5, "w": 4, "x": 4, "y": 0}, unit="percent", decimals=2, @@ -4464,6 +4476,7 @@ def build_jobs_dashboard(): "so failed or aborted runs lower the lane color without implying raw test failures." ), ) + history_panel["timeFrom"] = PLATFORM_TEST_HISTORY_WINDOW panels.append(history_panel) run_volume_panel = timeseries_panel( @@ -4580,12 +4593,13 @@ def build_jobs_dashboard(): ), ) ) + panels[-1]["timeFrom"] = PLATFORM_TEST_CASE_PANEL_WINDOW panels[-1]["links"] = jenkins_suite_links() panels[-1]["fieldConfig"]["defaults"]["links"] = jenkins_latest_artifact_data_links() panels.append( bargauge_panel( 147, - "Most Problematic Test by Suite (30d)", + "Most Problematic Test by Suite (7d)", worst_test_per_suite, {"h": 8, "w": 12, "x": 12, "y": 57}, unit="none", @@ -4599,8 +4613,8 @@ def build_jobs_dashboard(): ) ) panels[-1]["description"] = ( - "Worst test per suite summed across 30d. This catches historical repeat offenders even when the " - "current hourly top list is quiet." + "Worst test per suite summed across 7d. This catches repeat offenders while keeping dashboard " + "loads bounded; current hourly top list is quiet." ) panels.append( timeseries_panel( @@ -4619,8 +4633,9 @@ def build_jobs_dashboard(): ) panels[-1]["description"] = ( "Stacked hourly outcome volume for the selected suite/branch/test scope. " - "This uses vmalert rollups only, avoiding expensive raw 30-day per-test scans." + "This uses vmalert rollups only, avoiding expensive raw long-range per-test scans." ) + panels[-1]["timeFrom"] = PLATFORM_TEST_CASE_PANEL_WINDOW panels[-1]["fieldConfig"]["defaults"]["min"] = 0 panels[-1]["fieldConfig"]["defaults"]["custom"] = { "drawStyle": "bars", @@ -4641,6 +4656,7 @@ def build_jobs_dashboard(): "test-case pass-rate rollups instead of raw historical scans." ), ) + selected_pass_rate_panel["timeFrom"] = PLATFORM_TEST_CASE_PANEL_WINDOW selected_pass_rate_panel["links"] = jenkins_suite_links() selected_pass_rate_panel["fieldConfig"]["defaults"]["links"] = jenkins_artifact_data_links() panels.append(selected_pass_rate_panel) @@ -4845,7 +4861,7 @@ def build_jobs_dashboard(): panels.append( bargauge_panel( 149, - "Recent Branch Evidence by Suite (30d)", + "Recent Branch Evidence by Suite (7d)", recent_branch_evidence, {"h": 7, "w": 12, "x": 0, "y": 100}, unit="none", @@ -4860,7 +4876,7 @@ def build_jobs_dashboard(): panels.append( bargauge_panel( 150, - "Primary Branch Clean by Suite (30d)", + "Primary Branch Clean by Suite (7d)", primary_branch_clean_by_suite, {"h": 7, "w": 12, "x": 12, "y": 100}, unit="percent", @@ -4972,7 +4988,7 @@ def build_jobs_dashboard(): "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, - "time": {"from": "now-30d", "to": "now"}, + "time": {"from": "now-24h", "to": "now"}, "annotations": {"list": []}, "schemaVersion": 39, "style": "dark", diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json index ce1a74cd..f2d1efec 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-testing.json @@ -157,7 +157,7 @@ { "id": 3, "type": "stat", - "title": "CI Run Success Rate (30d)", + "title": "CI Run Success Rate (7d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -170,7 +170,7 @@ }, "targets": [ { - "expr": "100 * ((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[30d:15m])) or on() vector(0))) / clamp_min(((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[30d:15m])) or on() vector(0))), 1)", + "expr": "100 * ((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[7d:1h])) or on() vector(0))) / clamp_min(((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1h])) or on() vector(0))), 1)", "refId": "A", "instant": true } @@ -227,7 +227,7 @@ }, "textMode": "value" }, - "description": "Percent of selected quality-gate CI runs that completed successfully in 30d; higher means more stable automation." + "description": "Percent of selected quality-gate CI runs that completed successfully in 7d; higher means more stable automation." }, { "id": 4, @@ -990,7 +990,7 @@ }, "targets": [ { - "expr": "(100 * sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[7d:1m])) / (sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1m])))) and on(suite) ((sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1m]))) > 0)", + "expr": "(100 * sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[7d:1h])) / (sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1h])))) and on(suite) ((sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1h]))) > 0)", "refId": "A", "legendFormat": "{{suite}}", "format": "time_series", @@ -1050,7 +1050,8 @@ "mode": "single", "sort": "none" } - } + }, + "timeFrom": "7d" }, { "id": 153, @@ -2690,6 +2691,7 @@ "sort": "none" } }, + "timeFrom": "24h", "links": [ { "title": "Open Jenkins", @@ -2801,7 +2803,7 @@ { "id": 147, "type": "bargauge", - "title": "Most Problematic Test by Suite (30d)", + "title": "Most Problematic Test by Suite (7d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2814,7 +2816,7 @@ }, "targets": [ { - "expr": "sort_desc(topk by (suite) (1, (sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[30d:1h])))))", + "expr": "sort_desc(topk by (suite) (1, (sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[7d:1h])))))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{test}}", "instant": true @@ -3003,7 +3005,7 @@ } } ], - "description": "Worst test per suite summed across 30d. This catches historical repeat offenders even when the current hourly top list is quiet." + "description": "Worst test per suite summed across 7d. This catches repeat offenders while keeping dashboard loads bounded; current hourly top list is quiet." }, { "id": 146, @@ -3181,7 +3183,8 @@ "targetBlank": true } ], - "description": "Stacked hourly outcome volume for the selected suite/branch/test scope. This uses vmalert rollups only, avoiding expensive raw 30-day per-test scans." + "description": "Stacked hourly outcome volume for the selected suite/branch/test scope. This uses vmalert rollups only, avoiding expensive raw long-range per-test scans.", + "timeFrom": "24h" }, { "id": 152, @@ -3273,6 +3276,7 @@ "sort": "none" } }, + "timeFrom": "24h", "links": [ { "title": "Open Jenkins", @@ -3897,7 +3901,7 @@ { "id": 150, "type": "bargauge", - "title": "Primary Branch Clean by Suite (30d)", + "title": "Primary Branch Clean by Suite (7d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -3910,7 +3914,7 @@ }, "targets": [ { - "expr": "sort((100 * (((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d:15m]))) > bool 0) unless on(suite) ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[30d:15m]))) > bool 0))) or on(suite) (0 * ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d:15m]))) > bool 0)))", + "expr": "sort((100 * (((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[7d:1h]))) > bool 0) unless on(suite) ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[7d:1h]))) > bool 0))) or on(suite) (0 * ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[7d:1h]))) > bool 0)))", "refId": "A", "legendFormat": "{{suite}}", "instant": true @@ -4087,7 +4091,7 @@ { "id": 149, "type": "bargauge", - "title": "Recent Branch Evidence by Suite (30d)", + "title": "Recent Branch Evidence by Suite (7d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -4100,7 +4104,7 @@ }, "targets": [ { - "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d:15m])))", + "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[7d:1h])))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{branch}}", "instant": true @@ -4623,7 +4627,7 @@ } ], "time": { - "from": "now-30d", + "from": "now-24h", "to": "now" }, "annotations": { @@ -4758,7 +4762,7 @@ "name": "test", "label": "Test Case", "type": "query", - "query": "query_result(topk(250, count by (test) (max_over_time(platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category!~\"fixtures|golden|helpers\"}[$__range]))))", + "query": "query_result(topk(75, count by (test) (max_over_time(platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category!~\"fixtures|golden|helpers\"}[24h:1h]))))", "regex": "/test=\"([^\"]+)\"/", "current": { "text": "All", diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml index 51ecc1a2..a7307388 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -166,7 +166,7 @@ data: { "id": 3, "type": "stat", - "title": "CI Run Success Rate (30d)", + "title": "CI Run Success Rate (7d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -179,7 +179,7 @@ data: }, "targets": [ { - "expr": "100 * ((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[30d:15m])) or on() vector(0))) / clamp_min(((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[30d:15m])) or on() vector(0))), 1)", + "expr": "100 * ((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[7d:1h])) or on() vector(0))) / clamp_min(((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1h])) or on() vector(0))), 1)", "refId": "A", "instant": true } @@ -236,7 +236,7 @@ data: }, "textMode": "value" }, - "description": "Percent of selected quality-gate CI runs that completed successfully in 30d; higher means more stable automation." + "description": "Percent of selected quality-gate CI runs that completed successfully in 7d; higher means more stable automation." }, { "id": 4, @@ -999,7 +999,7 @@ data: }, "targets": [ { - "expr": "(100 * sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[7d:1m])) / (sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1m])))) and on(suite) ((sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1m]))) > 0)", + "expr": "(100 * sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[7d:1h])) / (sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1h])))) and on(suite) ((sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1h]))) > 0)", "refId": "A", "legendFormat": "{{suite}}", "format": "time_series", @@ -1059,7 +1059,8 @@ data: "mode": "single", "sort": "none" } - } + }, + "timeFrom": "7d" }, { "id": 153, @@ -2699,6 +2700,7 @@ data: "sort": "none" } }, + "timeFrom": "24h", "links": [ { "title": "Open Jenkins", @@ -2810,7 +2812,7 @@ data: { "id": 147, "type": "bargauge", - "title": "Most Problematic Test by Suite (30d)", + "title": "Most Problematic Test by Suite (7d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -2823,7 +2825,7 @@ data: }, "targets": [ { - "expr": "sort_desc(topk by (suite) (1, (sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[30d:1h])))))", + "expr": "sort_desc(topk by (suite) (1, (sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[7d:1h])))))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{test}}", "instant": true @@ -3012,7 +3014,7 @@ data: } } ], - "description": "Worst test per suite summed across 30d. This catches historical repeat offenders even when the current hourly top list is quiet." + "description": "Worst test per suite summed across 7d. This catches repeat offenders while keeping dashboard loads bounded; current hourly top list is quiet." }, { "id": 146, @@ -3190,7 +3192,8 @@ data: "targetBlank": true } ], - "description": "Stacked hourly outcome volume for the selected suite/branch/test scope. This uses vmalert rollups only, avoiding expensive raw 30-day per-test scans." + "description": "Stacked hourly outcome volume for the selected suite/branch/test scope. This uses vmalert rollups only, avoiding expensive raw long-range per-test scans.", + "timeFrom": "24h" }, { "id": 152, @@ -3282,6 +3285,7 @@ data: "sort": "none" } }, + "timeFrom": "24h", "links": [ { "title": "Open Jenkins", @@ -3906,7 +3910,7 @@ data: { "id": 150, "type": "bargauge", - "title": "Primary Branch Clean by Suite (30d)", + "title": "Primary Branch Clean by Suite (7d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -3919,7 +3923,7 @@ data: }, "targets": [ { - "expr": "sort((100 * (((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d:15m]))) > bool 0) unless on(suite) ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[30d:15m]))) > bool 0))) or on(suite) (0 * ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d:15m]))) > bool 0)))", + "expr": "sort((100 * (((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[7d:1h]))) > bool 0) unless on(suite) ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[7d:1h]))) > bool 0))) or on(suite) (0 * ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[7d:1h]))) > bool 0)))", "refId": "A", "legendFormat": "{{suite}}", "instant": true @@ -4096,7 +4100,7 @@ data: { "id": 149, "type": "bargauge", - "title": "Recent Branch Evidence by Suite (30d)", + "title": "Recent Branch Evidence by Suite (7d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -4109,7 +4113,7 @@ data: }, "targets": [ { - "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d:15m])))", + "expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[7d:1h])))", "refId": "A", "legendFormat": "{{suite}} \u00b7 {{branch}}", "instant": true @@ -4632,7 +4636,7 @@ data: } ], "time": { - "from": "now-30d", + "from": "now-24h", "to": "now" }, "annotations": { @@ -4767,7 +4771,7 @@ data: "name": "test", "label": "Test Case", "type": "query", - "query": "query_result(topk(250, count by (test) (max_over_time(platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category!~\"fixtures|golden|helpers\"}[$__range]))))", + "query": "query_result(topk(75, count by (test) (max_over_time(platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category!~\"fixtures|golden|helpers\"}[24h:1h]))))", "regex": "/test=\"([^\"]+)\"/", "current": { "text": "All",