diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 317765a6..5ac7d16c 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -446,20 +446,36 @@ PLATFORM_TEST_ACTIVITY_30D = ( 'or label_replace(sum by (status) (increase(metis_flashes_total[30d])), "source", "metis-flash", "__name__", ".*") ' 'or label_replace(sum by (status) (increase(ananke_quality_gate_runs_total{suite="ananke"}[30d])), "source", "ananke-quality", "__name__", ".*")' ) -PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES = ( - 'label_replace(100 * (sum by (task) (increase(ariadne_task_runs_total{status="ok"}[$__interval])) ' - '/ clamp_min(sum by (task) (increase(ariadne_task_runs_total[$__interval])), 1)), ' - '"suite", "ariadne:${1}", "task", "(.*)") ' - 'or label_replace(100 * (sum by (node) (increase(metis_builds_total{status="ok"}[$__interval])) ' - '/ clamp_min(sum by (node) (increase(metis_builds_total[$__interval])), 1)), ' - '"suite", "metis-build:${1}", "node", "(.*)") ' - 'or label_replace(100 * (sum by (node) (increase(metis_flashes_total{status="ok"}[$__interval])) ' - '/ clamp_min(sum by (node) (increase(metis_flashes_total[$__interval])), 1)), ' - '"suite", "metis-flash:${1}", "node", "(.*)") ' - 'or label_replace(100 * (sum by (instance) (increase(ananke_quality_gate_runs_total{suite="ananke",status="ok"}[$__interval])) ' - '/ clamp_min(sum by (instance) (increase(ananke_quality_gate_runs_total{suite="ananke"}[$__interval])), 1)), ' - '"suite", "ananke-quality:${1}", "instance", "(.*)")' +ARIADNE_SUITE_OK_INTERVAL = 'sum(increase(ariadne_task_runs_total{status="ok"}[$__interval]))' +ARIADNE_SUITE_TOTAL_INTERVAL = 'sum(increase(ariadne_task_runs_total[$__interval]))' +METIS_SUITE_OK_INTERVAL = ( + '(sum(increase(metis_builds_total{status="ok"}[$__interval])) + ' + 'sum(increase(metis_flashes_total{status="ok"}[$__interval])))' ) +METIS_SUITE_TOTAL_INTERVAL = ( + '(sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval])))' +) +ANANKE_SUITE_OK_INTERVAL = 'sum(increase(ananke_quality_gate_runs_total{suite="ananke",status="ok"}[$__interval]))' +ANANKE_SUITE_TOTAL_INTERVAL = 'sum(increase(ananke_quality_gate_runs_total{suite="ananke"}[$__interval]))' + +PLATFORM_TEST_SUCCESS_RATE_ARIADNE_SERIES = ( + f'(100 * ({ARIADNE_SUITE_OK_INTERVAL}) / ({ARIADNE_SUITE_TOTAL_INTERVAL})) ' + f'and on() (({ARIADNE_SUITE_TOTAL_INTERVAL}) > 0)' +) +PLATFORM_TEST_SUCCESS_RATE_METIS_SERIES = ( + f'(100 * ({METIS_SUITE_OK_INTERVAL}) / ({METIS_SUITE_TOTAL_INTERVAL})) ' + f'and on() (({METIS_SUITE_TOTAL_INTERVAL}) > 0)' +) +PLATFORM_TEST_SUCCESS_RATE_ANANKE_SERIES = ( + f'(100 * ({ANANKE_SUITE_OK_INTERVAL}) / ({ANANKE_SUITE_TOTAL_INTERVAL})) ' + f'and on() (({ANANKE_SUITE_TOTAL_INTERVAL}) > 0)' +) + +PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS = [ + {"refId": "A", "expr": PLATFORM_TEST_SUCCESS_RATE_ARIADNE_SERIES, "legendFormat": "ariadne"}, + {"refId": "B", "expr": PLATFORM_TEST_SUCCESS_RATE_METIS_SERIES, "legendFormat": "metis"}, + {"refId": "C", "expr": PLATFORM_TEST_SUCCESS_RATE_ANANKE_SERIES, "legendFormat": "ananke"}, +] ANANKE_SELECTOR = 'job="ananke-power"' ANANKE_UPS_DB_NAME = "Pyrphoros" ANANKE_UPS_DB_NODE = "titan-db" @@ -1482,13 +1498,7 @@ def build_overview(): None, {"h": 5, "w": 6, "x": 12, "y": 7}, unit="percent", - targets=[ - { - "refId": "A", - "expr": PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES, - "legendFormat": "{{suite}}", - } - ], + targets=PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS, legend_display="table", legend_placement="right", legend_calcs=["last"], @@ -1497,7 +1507,7 @@ def build_overview(): test_success["fieldConfig"]["defaults"]["min"] = 0 test_success["fieldConfig"]["defaults"]["max"] = 100 test_success["description"] = ( - "Per-test rolling pass rate (0-100) across Ariadne tasks, Metis build/flash node runs, and Ananke quality-gate runs." + "Application-level rolling pass rate (0-100). One line per suite (ariadne, metis, ananke); idle windows are left blank rather than forced to 0%." ) panels.append(test_success) test_failures = stat_panel( @@ -2969,20 +2979,14 @@ def build_jobs_dashboard(): None, {"h": 6, "w": 16, "x": 8, "y": 17}, unit="percent", - targets=[ - { - "refId": "A", - "expr": PLATFORM_TEST_SUCCESS_RATE_BY_SUITE_SERIES, - "legendFormat": "{{suite}}", - } - ], + targets=PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS, legend_display="list", legend_placement="bottom", ) suite_panel["fieldConfig"]["defaults"]["min"] = 0 suite_panel["fieldConfig"]["defaults"]["max"] = 100 suite_panel["description"] = ( - "Per-test/per-node pass percentage over time across Ariadne, Metis, and Ananke quality suites." + "Application-level pass percentage over time. One series per suite: ariadne, metis, ananke." ) panels.append(suite_panel) diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index 94cfb6ab..0a3da12c 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -1253,8 +1253,18 @@ "targets": [ { "refId": "A", - "expr": "label_replace(100 * (sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (task) (increase(ariadne_task_runs_total[$__interval])), 1)), \"suite\", \"ariadne:${1}\", \"task\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_builds_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_builds_total[$__interval])), 1)), \"suite\", \"metis-build:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_flashes_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_flashes_total[$__interval])), 1)), \"suite\", \"metis-flash:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) / clamp_min(sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])), 1)), \"suite\", \"ananke-quality:${1}\", \"instance\", \"(.*)\")", - "legendFormat": "{{suite}}" + "expr": "(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval]))) / (sum(increase(ariadne_task_runs_total[$__interval])))) and on() ((sum(increase(ariadne_task_runs_total[$__interval]))) > 0)", + "legendFormat": "ariadne" + }, + { + "refId": "B", + "expr": "(100 * ((sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) + sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])))) / ((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval]))))) and on() (((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval])))) > 0)", + "legendFormat": "metis" + }, + { + "refId": "C", + "expr": "(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval]))) / (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])))) and on() ((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval]))) > 0)", + "legendFormat": "ananke" } ], "fieldConfig": { @@ -1274,7 +1284,7 @@ "mode": "multi" } }, - "description": "Per-test/per-node pass percentage over time across Ariadne, Metis, and Ananke quality suites." + "description": "Application-level pass percentage over time. One series per suite: ariadne, metis, ananke." } ], "time": { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 8cd32cc9..f4552a3a 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1820,8 +1820,18 @@ "targets": [ { "refId": "A", - "expr": "label_replace(100 * (sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (task) (increase(ariadne_task_runs_total[$__interval])), 1)), \"suite\", \"ariadne:${1}\", \"task\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_builds_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_builds_total[$__interval])), 1)), \"suite\", \"metis-build:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_flashes_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_flashes_total[$__interval])), 1)), \"suite\", \"metis-flash:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) / clamp_min(sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])), 1)), \"suite\", \"ananke-quality:${1}\", \"instance\", \"(.*)\")", - "legendFormat": "{{suite}}" + "expr": "(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval]))) / (sum(increase(ariadne_task_runs_total[$__interval])))) and on() ((sum(increase(ariadne_task_runs_total[$__interval]))) > 0)", + "legendFormat": "ariadne" + }, + { + "refId": "B", + "expr": "(100 * ((sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) + sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])))) / ((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval]))))) and on() (((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval])))) > 0)", + "legendFormat": "metis" + }, + { + "refId": "C", + "expr": "(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval]))) / (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])))) and on() ((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval]))) > 0)", + "legendFormat": "ananke" } ], "fieldConfig": { @@ -1851,7 +1861,7 @@ "targetBlank": true } ], - "description": "Per-test rolling pass rate (0-100) across Ariadne tasks, Metis build/flash node runs, and Ananke quality-gate runs." + "description": "Application-level rolling pass rate (0-100). One line per suite (ariadne, metis, ananke); idle windows are left blank rather than forced to 0%." }, { "id": 47, diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index e4f0fb4d..fc825a3a 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -1262,8 +1262,18 @@ data: "targets": [ { "refId": "A", - "expr": "label_replace(100 * (sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (task) (increase(ariadne_task_runs_total[$__interval])), 1)), \"suite\", \"ariadne:${1}\", \"task\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_builds_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_builds_total[$__interval])), 1)), \"suite\", \"metis-build:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_flashes_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_flashes_total[$__interval])), 1)), \"suite\", \"metis-flash:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) / clamp_min(sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])), 1)), \"suite\", \"ananke-quality:${1}\", \"instance\", \"(.*)\")", - "legendFormat": "{{suite}}" + "expr": "(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval]))) / (sum(increase(ariadne_task_runs_total[$__interval])))) and on() ((sum(increase(ariadne_task_runs_total[$__interval]))) > 0)", + "legendFormat": "ariadne" + }, + { + "refId": "B", + "expr": "(100 * ((sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) + sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])))) / ((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval]))))) and on() (((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval])))) > 0)", + "legendFormat": "metis" + }, + { + "refId": "C", + "expr": "(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval]))) / (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])))) and on() ((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval]))) > 0)", + "legendFormat": "ananke" } ], "fieldConfig": { @@ -1283,7 +1293,7 @@ data: "mode": "multi" } }, - "description": "Per-test/per-node pass percentage over time across Ariadne, Metis, and Ananke quality suites." + "description": "Application-level pass percentage over time. One series per suite: ariadne, metis, ananke." } ], "time": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 9f30d15c..c58c47f0 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1829,8 +1829,18 @@ data: "targets": [ { "refId": "A", - "expr": "label_replace(100 * (sum by (task) (increase(ariadne_task_runs_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (task) (increase(ariadne_task_runs_total[$__interval])), 1)), \"suite\", \"ariadne:${1}\", \"task\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_builds_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_builds_total[$__interval])), 1)), \"suite\", \"metis-build:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (node) (increase(metis_flashes_total{status=\"ok\"}[$__interval])) / clamp_min(sum by (node) (increase(metis_flashes_total[$__interval])), 1)), \"suite\", \"metis-flash:${1}\", \"node\", \"(.*)\") or label_replace(100 * (sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval])) / clamp_min(sum by (instance) (increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])), 1)), \"suite\", \"ananke-quality:${1}\", \"instance\", \"(.*)\")", - "legendFormat": "{{suite}}" + "expr": "(100 * (sum(increase(ariadne_task_runs_total{status=\"ok\"}[$__interval]))) / (sum(increase(ariadne_task_runs_total[$__interval])))) and on() ((sum(increase(ariadne_task_runs_total[$__interval]))) > 0)", + "legendFormat": "ariadne" + }, + { + "refId": "B", + "expr": "(100 * ((sum(increase(metis_builds_total{status=\"ok\"}[$__interval])) + sum(increase(metis_flashes_total{status=\"ok\"}[$__interval])))) / ((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval]))))) and on() (((sum(increase(metis_builds_total[$__interval])) + sum(increase(metis_flashes_total[$__interval])))) > 0)", + "legendFormat": "metis" + }, + { + "refId": "C", + "expr": "(100 * (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\",status=\"ok\"}[$__interval]))) / (sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval])))) and on() ((sum(increase(ananke_quality_gate_runs_total{suite=\"ananke\"}[$__interval]))) > 0)", + "legendFormat": "ananke" } ], "fieldConfig": { @@ -1860,7 +1870,7 @@ data: "targetBlank": true } ], - "description": "Per-test rolling pass rate (0-100) across Ariadne tasks, Metis build/flash node runs, and Ananke quality-gate runs." + "description": "Application-level rolling pass rate (0-100). One line per suite (ariadne, metis, ananke); idle windows are left blank rather than forced to 0%." }, { "id": 47,