monitoring(jobs): split testing dashboard and clean up job ops view

2026-04-12 20:05:39 -03:00 · 2026-04-12 20:05:39 -03:00 · 299a68ad95
commit 299a68ad95
parent 049a0deb04
8 changed files with 3488 additions and 2253 deletions
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@ -428,13 +428,59 @@ ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
 ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
    f"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{{{ARIADNE_SCHEDULE_TASK_FILTER}}}[$__range])) / 3600"
 )
+ARIADNE_SCHEDULE_NEXT_RUN_HOURS = (
+    f"((ariadne_schedule_next_run_timestamp_seconds{{{ARIADNE_SCHEDULE_TASK_FILTER}}} - time()) / 3600)"
+)
 ARIADNE_SCHEDULE_LAST_STATUS = f"ariadne_schedule_last_status{{{ARIADNE_SCHEDULE_TASK_FILTER}}}"
+ARIADNE_SCHEDULE_SIGNAL_COUNT = (  # number of schedule series exporting a last-success timestamp; 0 when none exist
+    f"count(ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_SCHEDULE_TASK_FILTER}}}) or on() vector(0)"
+)
+ARIADNE_SCHEDULE_STALE_WINDOW_SEC = 36 * 3600  # 36 hours expressed in seconds
+ARIADNE_SCHEDULE_STALE_COUNT = (  # series whose last success is older than the stale window; 0 when nothing matches
+    f"sum(((time() - ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_SCHEDULE_TASK_FILTER}}}) > bool {ARIADNE_SCHEDULE_STALE_WINDOW_SEC}))"
+    " or on() vector(0)"
+)
+ARIADNE_SCHEDULE_MISSING_SUCCESS_COUNT = (  # next-run series with no last-success series for the same task label
+    f"count((ariadne_schedule_next_run_timestamp_seconds{{{ARIADNE_SCHEDULE_TASK_FILTER}}} unless on(task) "
+    f"ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_SCHEDULE_TASK_FILTER}}})) or on() vector(0)"
+)
+ARIADNE_SCHEDULE_FAILED_LAST_COUNT = (  # series whose last_status is below 1 (i.e. 1 - status > 0); 0 when nothing matches
+    f"sum(((1 - ariadne_schedule_last_status{{{ARIADNE_SCHEDULE_TASK_FILTER}}}) > bool 0)) or on() vector(0)"
+)
 ARIADNE_SCHEDULE_RUNS_RANGE = (
    f'sum by (task) (increase(ariadne_task_runs_total{{{ARIADNE_SCHEDULE_TASK_FILTER}}}[$__range]))'
 )
 ARIADNE_SCHEDULE_ERRORS_RANGE = (
    f'sum by (task) (increase(ariadne_task_runs_total{{status="error",{ARIADNE_SCHEDULE_TASK_FILTER}}}[$__range]))'
 )
+JENKINS_CLEANUP_SIGNAL_COUNT = (  # number of cleanup last-run timestamp series; 0 when the metric is absent
+    "count(ariadne_jenkins_workspace_cleanup_last_run_timestamp_seconds) or on() vector(0)"
+)
+JENKINS_CLEANUP_RUNS_RANGE = (  # cleanup run counter increase over the dashboard range, grouped by mode and status
+    "sum by (mode, status) (increase(ariadne_jenkins_workspace_cleanup_runs_total[$__range]))"
+)
+JENKINS_CLEANUP_OBJECTS_RANGE = (  # cleanup object counter increase over the dashboard range, grouped by kind, action and mode
+    "sum by (kind, action, mode) (increase(ariadne_jenkins_workspace_cleanup_objects_total[$__range]))"
+)
+JENKINS_CLEANUP_LAST_RUN_AGE_HOURS = (  # hours since the last-run timestamp; 999 is the fallback when the metric is absent
+    "((time() - ariadne_jenkins_workspace_cleanup_last_run_timestamp_seconds) / 3600) or on() vector(999)"
+)
+JENKINS_CLEANUP_LAST_SUCCESS_AGE_HOURS = (  # hours since the last-success timestamp; 999 is the fallback when the metric is absent
+    "((time() - ariadne_jenkins_workspace_cleanup_last_success_timestamp_seconds) / 3600) or on() vector(999)"
+)
+JENKINS_CLEANUP_LAST_DELETED = "ariadne_jenkins_workspace_cleanup_last_deleted_total or on() vector(0)"  # falls back to 0 when absent
+JENKINS_CLEANUP_LAST_PLANNED = "ariadne_jenkins_workspace_cleanup_last_planned_total or on() vector(0)"  # falls back to 0 when absent
+JENKINS_WORKSPACE_PV_STALE_COUNT = (  # sums Released/Failed phase flags joined to jenkins pvc-workspace-* claim refs; 0 when nothing matches
+    'sum((kube_persistentvolume_status_phase{phase=~"Released|Failed"} > bool 0) '
+    '* on(persistentvolume) group_left(claim_namespace,name) '
+    'kube_persistentvolume_claim_ref{claim_namespace="jenkins",name=~"pvc-workspace-.*"}) or on() vector(0)'
+)
+JENKINS_WORKSPACE_PV_STALE_AGE_HOURS = (
+    # Age (h) of jenkins workspace PVs. The phase selector can match two series per PV, so collapse it per PV and keep only Released/Failed.
+    '((time() - kube_persistentvolume_created) / 3600) '
+    '* on(persistentvolume) group_left(claim_namespace,name) kube_persistentvolume_claim_ref{claim_namespace="jenkins",name=~"pvc-workspace-.*"} '
+    '* on(persistentvolume) group_left() (max by (persistentvolume) (kube_persistentvolume_status_phase{phase=~"Released|Failed"}) > 0)'
+)
 ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
 PLATFORM_TEST_SUITE_NAMES = [
    "ariadne",
@ -1603,7 +1649,7 @@ def build_overview():
        legend_display="table",
        legend_placement="right",
        legend_calcs=["lastNotNull"],
-        links=link_to("atlas-jobs"),
+        links=link_to("atlas-testing"),
    )
    test_success["fieldConfig"]["defaults"]["min"] = 0
    test_success["fieldConfig"]["defaults"]["max"] = 100
@ -2903,6 +2949,15 @@ def build_jobs_dashboard():
            {"color": "red", "value": 48},
        ],
    }
+    old_age_thresholds = {
+        "mode": "absolute",
+        "steps": [
+            {"color": "green", "value": None},
+            {"color": "yellow", "value": 24},
+            {"color": "orange", "value": 72},
+            {"color": "red", "value": 168},
+        ],
+    }
    recent_error_thresholds = {
        "mode": "absolute",
        "steps": [
@ -2912,7 +2967,6 @@ def build_jobs_dashboard():
            {"color": "green", "value": 24},
        ],
    }
-
    task_error_thresholds = {
        "mode": "absolute",
        "steps": [
@ -2922,6 +2976,15 @@ def build_jobs_dashboard():
            {"color": "red", "value": 5},
        ],
    }
+    count_thresholds = {
+        "mode": "absolute",
+        "steps": [
+            {"color": "green", "value": None},
+            {"color": "yellow", "value": 1},
+            {"color": "orange", "value": 2},
+            {"color": "red", "value": 3},
+        ],
+    }
    schedule_status_thresholds = {
        "mode": "absolute",
        "steps": [
@ -2932,24 +2995,162 @@ def build_jobs_dashboard():
    }

    panels.append(
-        bargauge_panel(
+        stat_panel(
            1,
-            "Ariadne Task Errors (range)",
-            ARIADNE_TASK_ERRORS_RANGE,
-            {"h": 7, "w": 8, "x": 0, "y": 0},
+            "Schedule Metrics Exported",
+            ARIADNE_SCHEDULE_SIGNAL_COUNT,
+            {"h": 4, "w": 4, "x": 0, "y": 0},
            unit="none",
            instant=True,
-            legend="{{task}}",
-            thresholds=task_error_thresholds,
+            thresholds={
+                "mode": "absolute",
+                "steps": [
+                    {"color": "red", "value": None},
+                    {"color": "green", "value": 1},
+                ],
+            },
        )
    )
    panels.append(
+        stat_panel(
+            2,
+            "Schedule Tasks Stale (>36h)",
+            ARIADNE_SCHEDULE_STALE_COUNT,
+            {"h": 4, "w": 4, "x": 4, "y": 0},
+            unit="none",
+            thresholds=count_thresholds,
+        )
+    )
+    panels.append(
+        stat_panel(
+            3,
+            "Schedule Tasks Missing Success",
+            ARIADNE_SCHEDULE_MISSING_SUCCESS_COUNT,
+            {"h": 4, "w": 4, "x": 8, "y": 0},
+            unit="none",
+            thresholds=count_thresholds,
+        )
+    )
+    panels.append(
+        stat_panel(
+            4,
+            "Schedule Tasks Failed Last Run",
+            ARIADNE_SCHEDULE_FAILED_LAST_COUNT,
+            {"h": 4, "w": 4, "x": 12, "y": 0},
+            unit="none",
+            thresholds=count_thresholds,
+        )
+    )
+    panels.append(
+        stat_panel(
+            5,
+            "Glue Jobs Stale (>36h)",
+            GLUE_STALE_COUNT,
+            {"h": 4, "w": 4, "x": 16, "y": 0},
+            unit="none",
+            thresholds=count_thresholds,
+        )
+    )
+    panels.append(
+        stat_panel(
+            6,
+            "Jenkins Workspace PV Backlog",
+            JENKINS_WORKSPACE_PV_STALE_COUNT,
+            {"h": 4, "w": 4, "x": 20, "y": 0},
+            unit="none",
+            thresholds={
+                "mode": "absolute",
+                "steps": [
+                    {"color": "green", "value": None},
+                    {"color": "yellow", "value": 1},
+                    {"color": "orange", "value": 10},
+                    {"color": "red", "value": 25},
+                ],
+            },
+        )
+    )
+    schedule_list_panel = table_panel(
+        7,
+        "Ariadne Schedules: Last Success (h, newest first)",
+        ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS,
+        {"h": 8, "w": 12, "x": 0, "y": 4},
+        unit="h",
+        transformations=[
+            {"id": "labelsToFields", "options": {}},
+            {"id": "sortBy", "options": {"fields": ["Value"], "order": "asc"}},
+        ],
+        instant=True,
+    )
+    schedule_list_panel["description"] = "Primary schedule inventory ordered by recency so fresh jobs stay at the top."
+    panels.append(schedule_list_panel)
+    panels.append(
+        bargauge_panel(
+            8,
+            "Ariadne Schedule Last Error (hours ago)",
+            ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS,
+            {"h": 8, "w": 12, "x": 12, "y": 4},
+            unit="h",
+            instant=True,
+            legend="{{task}}",
+            sort_order="asc",
+            thresholds=recent_error_thresholds,
+            decimals=2,
+        )
+    )
+    status_panel = bargauge_panel(
+        9,
+        "Ariadne Schedule Last Status",
+        ARIADNE_SCHEDULE_LAST_STATUS,
+        {"h": 8, "w": 8, "x": 0, "y": 12},
+        unit="none",
+        instant=True,
+        legend="{{task}}",
+        sort_order="asc",
+        thresholds=schedule_status_thresholds,
+        decimals=0,
+    )
+    status_panel["description"] = "1 means the last run was ok. 0 means the last run ended in error."
+    status_panel["fieldConfig"]["defaults"]["mappings"] = [
        {
-            "id": 2,
+            "type": "value",
+            "options": {
+                "0": {"text": "error"},
+                "1": {"text": "ok"},
+            },
+        }
+    ]
+    panels.append(status_panel)
+    schedule_runs_panel = bargauge_panel(
+        10,
+        "Ariadne Schedule Runs (range)",
+        ARIADNE_SCHEDULE_RUNS_RANGE,
+        {"h": 8, "w": 8, "x": 8, "y": 12},
+        unit="none",
+        instant=True,
+        legend="{{task}}",
+        thresholds={"mode": "absolute", "steps": [{"color": "green", "value": None}]},
+    )
+    schedule_runs_panel["description"] = "Number of runs by schedule task over the selected dashboard time range."
+    panels.append(schedule_runs_panel)
+    schedule_errors_panel = bargauge_panel(
+        11,
+        "Ariadne Schedule Errors (range)",
+        ARIADNE_SCHEDULE_ERRORS_RANGE,
+        {"h": 8, "w": 8, "x": 16, "y": 12},
+        unit="none",
+        instant=True,
+        legend="{{task}}",
+        thresholds=task_error_thresholds,
+    )
+    schedule_errors_panel["description"] = "Error run count by schedule task over the selected dashboard time range."
+    panels.append(schedule_errors_panel)
+    panels.append(
+        {
+            "id": 12,
            "type": "timeseries",
            "title": "Ariadne Attempts / Failures",
            "datasource": PROM_DS,
-            "gridPos": {"h": 7, "w": 8, "x": 8, "y": 0},
+            "gridPos": {"h": 7, "w": 12, "x": 0, "y": 20},
            "targets": [
                {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
                {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
@ -2979,10 +3180,10 @@ def build_jobs_dashboard():
    )
    panels.append(
        bargauge_panel(
-            3,
+            13,
            "One-off Job Pods (age hours)",
            ONEOFF_JOB_POD_AGE_HOURS,
-            {"h": 7, "w": 8, "x": 16, "y": 0},
+            {"h": 7, "w": 12, "x": 12, "y": 20},
            unit="h",
            instant=True,
            legend="{{namespace}}/{{pod}}",
@ -2991,186 +3192,251 @@ def build_jobs_dashboard():
            decimals=2,
        )
    )
+    panels.append(
+        bargauge_panel(
+            14,
+            "Glue Jobs Last Success (hours ago)",
+            GLUE_LAST_SUCCESS_RANGE_HOURS,
+            {"h": 6, "w": 12, "x": 0, "y": 27},
+            unit="h",
+            instant=True,
+            legend="{{namespace}}/{{cronjob}}",
+            thresholds=age_thresholds,
+            decimals=2,
+        )
+    )
+    panels.append(
+        bargauge_panel(
+            15,
+            "Glue Jobs Last Schedule (hours ago)",
+            GLUE_LAST_SCHEDULE_RANGE_HOURS,
+            {"h": 6, "w": 12, "x": 12, "y": 27},
+            unit="h",
+            instant=True,
+            legend="{{namespace}}/{{cronjob}}",
+            thresholds=age_thresholds,
+            decimals=2,
+        )
+    )
    panels.append(
        stat_panel(
-            4,
-            "Glue Jobs Stale (>36h)",
-            GLUE_STALE_COUNT,
-            {"h": 4, "w": 4, "x": 0, "y": 7},
+            16,
+            "Jenkins Cleanup Signal Present",
+            JENKINS_CLEANUP_SIGNAL_COUNT,
+            {"h": 4, "w": 4, "x": 0, "y": 33},
            unit="none",
+            instant=True,
            thresholds={
                "mode": "absolute",
                "steps": [
-                    {"color": "green", "value": None},
-                    {"color": "yellow", "value": 1},
-                    {"color": "orange", "value": 2},
-                    {"color": "red", "value": 3},
+                    {"color": "red", "value": None},
+                    {"color": "green", "value": 1},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
-            5,
-            "Glue Jobs Missing Success",
-            GLUE_MISSING_COUNT,
-            {"h": 4, "w": 4, "x": 4, "y": 7},
-            unit="none",
-        )
-    )
-    panels.append(
-        stat_panel(
-            6,
-            "Glue Jobs Suspended",
-            GLUE_SUSPENDED_COUNT,
-            {"h": 4, "w": 4, "x": 8, "y": 7},
-            unit="none",
-        )
-    )
-    panels.append(
-        stat_panel(
-            7,
-            "Ariadne Task Errors (1h)",
-            ARIADNE_TASK_ERRORS_1H_TOTAL,
-            {"h": 4, "w": 4, "x": 12, "y": 7},
-            unit="none",
-        )
-    )
-    panels.append(
-        stat_panel(
-            8,
-            "Ariadne Task Errors (24h)",
-            ARIADNE_TASK_ERRORS_24H_TOTAL,
-            {"h": 4, "w": 4, "x": 16, "y": 7},
-            unit="none",
-        )
-    )
-    panels.append(
-        stat_panel(
-            9,
-            "Ariadne Task Runs (1h)",
-            ARIADNE_TASK_RUNS_1H_TOTAL,
-            {"h": 4, "w": 4, "x": 20, "y": 7},
-            unit="none",
-        )
-    )
-    panels.append(
-        bargauge_panel(
-            10,
-            "Ariadne Schedule Last Error (hours ago)",
-            ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS,
-            {"h": 6, "w": 12, "x": 0, "y": 17},
+            17,
+            "Jenkins Cleanup Last Run Age (h)",
+            JENKINS_CLEANUP_LAST_RUN_AGE_HOURS,
+            {"h": 4, "w": 4, "x": 4, "y": 33},
            unit="h",
-            instant=True,
-            legend="{{task}}",
-            thresholds=recent_error_thresholds,
            decimals=2,
-        )
-    )
-    panels.append(
-        bargauge_panel(
-            11,
-            "Ariadne Schedule Last Success (hours ago, newest first)",
-            ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS,
-            {"h": 6, "w": 12, "x": 12, "y": 17},
-            unit="h",
            instant=True,
-            legend="{{task}}",
-            sort_order="asc",
            thresholds=age_thresholds,
-            decimals=2,
        )
    )
    panels.append(
-        bargauge_panel(
-            12,
-            "Glue Jobs Last Success (hours ago)",
-            GLUE_LAST_SUCCESS_RANGE_HOURS,
-            {"h": 6, "w": 12, "x": 0, "y": 23},
+        stat_panel(
+            18,
+            "Jenkins Cleanup Last Success Age (h)",
+            JENKINS_CLEANUP_LAST_SUCCESS_AGE_HOURS,
+            {"h": 4, "w": 4, "x": 8, "y": 33},
            unit="h",
-            instant=True,
-            legend="{{namespace}}/{{cronjob}}",
-            thresholds=age_thresholds,
            decimals=2,
+            instant=True,
+            thresholds=age_thresholds,
        )
    )
    panels.append(
-        bargauge_panel(
-            13,
-            "Glue Jobs Last Schedule (hours ago)",
-            GLUE_LAST_SCHEDULE_RANGE_HOURS,
-            {"h": 6, "w": 12, "x": 12, "y": 23},
-            unit="h",
-            instant=True,
-            legend="{{namespace}}/{{cronjob}}",
-            thresholds=age_thresholds,
-            decimals=2,
-        )
-    )
-    panels.append(
-        bargauge_panel(
-            14,
-            "Ariadne Task Errors (1h)",
-            ARIADNE_TASK_ERRORS_1H,
-            {"h": 6, "w": 12, "x": 0, "y": 29},
+        stat_panel(
+            19,
+            "Jenkins Cleanup Planned (last run)",
+            JENKINS_CLEANUP_LAST_PLANNED,
+            {"h": 4, "w": 4, "x": 12, "y": 33},
            unit="none",
            instant=True,
-            legend="{{task}}",
-            thresholds=task_error_thresholds,
        )
    )
    panels.append(
-        bargauge_panel(
-            15,
-            "Ariadne Task Errors (30d)",
-            ARIADNE_TASK_ERRORS_30D,
-            {"h": 6, "w": 12, "x": 12, "y": 29},
+        stat_panel(
+            20,
+            "Jenkins Cleanup Deleted (last run)",
+            JENKINS_CLEANUP_LAST_DELETED,
+            {"h": 4, "w": 4, "x": 16, "y": 33},
            unit="none",
            instant=True,
-            legend="{{task}}",
-            thresholds=task_error_thresholds,
        )
    )
    panels.append(
-        bargauge_panel(
-            16,
+        stat_panel(
+            21,
            "Ariadne Access Requests",
            ARIADNE_ACCESS_REQUESTS,
-            {"h": 6, "w": 8, "x": 0, "y": 11},
+            {"h": 4, "w": 4, "x": 20, "y": 33},
            unit="none",
            instant=True,
-            legend="{{status}}",
        )
    )
-    coverage_panel = stat_panel(
-        17,
+    panels.append(
+        timeseries_panel(
+            22,
+            "Jenkins Cleanup Runs (range)",
+            None,
+            {"h": 7, "w": 12, "x": 0, "y": 37},
+            unit="none",
+            targets=[
+                {"refId": "A", "expr": JENKINS_CLEANUP_RUNS_RANGE, "legendFormat": "{{mode}}/{{status}}"},
+            ],
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
+    panels.append(
+        timeseries_panel(
+            23,
+            "Jenkins Cleanup Objects (range)",
+            None,
+            {"h": 7, "w": 12, "x": 12, "y": 37},
+            unit="none",
+            targets=[
+                {"refId": "A", "expr": JENKINS_CLEANUP_OBJECTS_RANGE, "legendFormat": "{{kind}}/{{action}}/{{mode}}"},
+            ],
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
+    stale_volume_panel = bargauge_panel(
+        24,
+        "Jenkins Workspace PV Age (h, detached only)",
+        JENKINS_WORKSPACE_PV_STALE_AGE_HOURS,
+        {"h": 10, "w": 24, "x": 0, "y": 44},
+        unit="h",
+        instant=True,
+        legend="{{name}} -> {{persistentvolume}}",
+        thresholds=old_age_thresholds,
+        decimals=1,
+        limit=40,
+    )
+    stale_volume_panel["description"] = (
+        "Oldest detached Jenkins workspace volumes first. This is the direct cleanup backlog view."
+    )
+    panels.append(stale_volume_panel)
+
+    return {
+        "uid": "atlas-jobs",
+        "title": "Atlas Jobs",
+        "folderUid": PRIVATE_FOLDER,
+        "editable": True,
+        "panels": panels,
+        "time": {"from": "now-7d", "to": "now"},
+        "annotations": {"list": []},
+        "schemaVersion": 39,
+        "style": "dark",
+        "tags": ["atlas", "jobs", "glue"],
+    }
+
+
+def build_testing_dashboard():
+    panels = []
+    pass_rate_thresholds = {
+        "mode": "absolute",
+        "steps": [
+            {"color": "red", "value": None},
+            {"color": "orange", "value": 80},
+            {"color": "yellow", "value": 95},
+            {"color": "green", "value": 99},
+        ],
+    }
+    failures_thresholds = {
+        "mode": "absolute",
+        "steps": [
+            {"color": "green", "value": None},
+            {"color": "yellow", "value": 1},
+            {"color": "orange", "value": 3},
+            {"color": "red", "value": 5},
+        ],
+    }
+
+    pass_rate_panel = stat_panel(
+        1,
        "Platform Test Success Rate (30d)",
        TEST_SUCCESS_RATE,
-        {"h": 6, "w": 4, "x": 8, "y": 11},
+        {"h": 4, "w": 6, "x": 0, "y": 0},
        unit="percent",
        decimals=2,
        instant=True,
+        thresholds=pass_rate_thresholds,
    )
-    coverage_panel["description"] = "Internal rollup across Ariadne task runs and Metis build/flash outcomes."
-    panels.append(coverage_panel)
-    tests_panel = table_panel(
-        18,
+    pass_rate_panel["description"] = "Overall success rate across tracked suites over the last 30 days."
+    panels.append(pass_rate_panel)
+    failures_panel = stat_panel(
+        2,
+        "Platform Test Failures (24h)",
+        TEST_FAILURES_24H_TOTAL,
+        {"h": 4, "w": 6, "x": 6, "y": 0},
+        unit="none",
+        instant=True,
+        thresholds=failures_thresholds,
+    )
+    failures_panel["description"] = "Total failed runs in the last 24 hours."
+    panels.append(failures_panel)
+    activity_panel = table_panel(
+        3,
        "Platform Test Activity (30d)",
        PLATFORM_TEST_ACTIVITY_30D,
-        {"h": 6, "w": 12, "x": 12, "y": 11},
+        {"h": 8, "w": 12, "x": 12, "y": 0},
        unit="none",
-        transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
+        transformations=[
+            {"id": "labelsToFields", "options": {}},
+            {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}},
+        ],
        instant=True,
    )
-    tests_panel["description"] = (
-        "Atlas Overview test panels depend on this internal activity table sourced from Ariadne and Metis counters."
+    activity_panel["description"] = "Suite/status event counts over 30 days."
+    panels.append(activity_panel)
+    panels.append(
+        bargauge_panel(
+            4,
+            "Platform Test Failures by Suite (24h)",
+            PLATFORM_TEST_FAILURES_24H_BY_SUITE,
+            {"h": 8, "w": 12, "x": 0, "y": 8},
+            unit="none",
+            instant=True,
+            legend="{{suite}}",
+            thresholds=failures_thresholds,
+        )
+    )
+    panels.append(
+        bargauge_panel(
+            5,
+            "Platform Test Success Rate by Suite (24h, lowest first)",
+            PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE,
+            {"h": 8, "w": 12, "x": 12, "y": 8},
+            unit="percent",
+            instant=True,
+            legend="{{suite}}",
+            sort_order="asc",
+            thresholds=pass_rate_thresholds,
+            decimals=2,
+        )
    )
-    panels.append(tests_panel)
    suite_panel = timeseries_panel(
-        19,
+        6,
        "Platform Test Success Rate by Suite",
        None,
-        {"h": 6, "w": 16, "x": 8, "y": 17},
+        {"h": 8, "w": 24, "x": 0, "y": 16},
        unit="percent",
        targets=PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS,
        legend_display="list",
@ -3187,69 +3453,20 @@ def build_jobs_dashboard():
        "pointSize": 4,
        "spanNulls": True,
    }
-    suite_panel["description"] = (
-        "Per-run interval pass points (0-100) per suite. Existing suites: ariadne, metis, ananke; additional suites appear automatically when platform_quality_gate_runs_total is published."
-    )
+    suite_panel["description"] = "Trend line per suite. Flat gaps mean no runs in that interval."
    panels.append(suite_panel)
-    status_panel = bargauge_panel(
-        20,
-        "Ariadne Schedule Last Status",
-        ARIADNE_SCHEDULE_LAST_STATUS,
-        {"h": 8, "w": 8, "x": 0, "y": 35},
-        unit="none",
-        instant=True,
-        legend="{{task}}",
-        sort_order="asc",
-        thresholds=schedule_status_thresholds,
-        decimals=0,
-    )
-    status_panel["description"] = "1 means the last run was ok. 0 means the last run ended in error."
-    status_panel["fieldConfig"]["defaults"]["mappings"] = [
-        {
-            "type": "value",
-            "options": {
-                "0": {"text": "error"},
-                "1": {"text": "ok"},
-            },
-        }
-    ]
-    panels.append(status_panel)
-    schedule_runs_panel = bargauge_panel(
-        21,
-        "Ariadne Schedule Runs (range)",
-        ARIADNE_SCHEDULE_RUNS_RANGE,
-        {"h": 8, "w": 8, "x": 8, "y": 35},
-        unit="none",
-        instant=True,
-        legend="{{task}}",
-        thresholds={"mode": "absolute", "steps": [{"color": "green", "value": None}]},
-    )
-    schedule_runs_panel["description"] = "Number of runs by schedule task over the selected dashboard time range."
-    panels.append(schedule_runs_panel)
-    schedule_errors_panel = bargauge_panel(
-        22,
-        "Ariadne Schedule Errors (range)",
-        ARIADNE_SCHEDULE_ERRORS_RANGE,
-        {"h": 8, "w": 8, "x": 16, "y": 35},
-        unit="none",
-        instant=True,
-        legend="{{task}}",
-        thresholds=task_error_thresholds,
-    )
-    schedule_errors_panel["description"] = "Error run count by schedule task over the selected dashboard time range."
-    panels.append(schedule_errors_panel)

    return {
-        "uid": "atlas-jobs",
-        "title": "Atlas Jobs",
+        "uid": "atlas-testing",
+        "title": "Atlas Testing",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
-        "time": {"from": "now-7d", "to": "now"},
+        "time": {"from": "now-30d", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
-        "tags": ["atlas", "jobs", "glue"],
+        "tags": ["atlas", "testing", "quality"],
    }


@ -3529,6 +3746,10 @@ DASHBOARDS = {
        "builder": build_jobs_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml",
    },
+    "atlas-testing": {
+        "builder": build_testing_dashboard,
+        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-testing.yaml",
+    },
    "atlas-power": {
        "builder": build_power_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-power.yaml",
--- a/services/monitoring/dashboards/atlas-jobs.json
+++ b/services/monitoring/dashboards/atlas-jobs.json
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@ -1915,8 +1915,8 @@
      },
      "links": [
        {
-          "title": "Open atlas-jobs dashboard",
-          "url": "/d/atlas-jobs",
+          "title": "Open atlas-testing dashboard",
+          "url": "/d/atlas-testing",
          "targetBlank": true
        }
      ],
--- a/services/monitoring/dashboards/atlas-testing.json
+++ b/services/monitoring/dashboards/atlas-testing.json
@ -0,0 +1,462 @@
+{
+  "uid": "atlas-testing",
+  "title": "Atlas Testing",
+  "folderUid": "atlas-internal",
+  "editable": true,
+  "panels": [
+    {
+      "id": 1,
+      "type": "stat",
+      "title": "Platform Test Success Rate (30d)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 0,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "100 * ((sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[30d])) or on() vector(0))), 1)",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "orange",
+                "value": 80
+              },
+              {
+                "color": "yellow",
+                "value": 95
+              },
+              {
+                "color": "green",
+                "value": 99
+              }
+            ]
+          },
+          "unit": "percent",
+          "custom": {
+            "displayMode": "auto"
+          },
+          "decimals": 2
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      },
+      "description": "Overall success rate across tracked suites over the last 30 days."
+    },
+    {
+      "id": 2,
+      "type": "stat",
+      "title": "Platform Test Failures (24h)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 6,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "(sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status!~\"ok|passed|success\"}[24h])) or on() vector(0))",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 1
+              },
+              {
+                "color": "orange",
+                "value": 3
+              },
+              {
+                "color": "red",
+                "value": 5
+              }
+            ]
+          },
+          "unit": "none",
+          "custom": {
+            "displayMode": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "center",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "value"
+      },
+      "description": "Total failed runs in the last 24 hours."
+    },
+    {
+      "id": 3,
+      "type": "table",
+      "title": "Platform Test Activity (30d)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum by (suite, status) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[30d]))",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "none",
+          "custom": {
+            "filterable": true
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true,
+        "columnFilters": false
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        },
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        }
+      ],
+      "description": "Suite/status event counts over 30 days."
+    },
+    {
+      "id": 4,
+      "type": "bargauge",
+      "title": "Platform Test Failures by Suite (24h)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
+      "targets": [
+        {
+          "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status!~\"ok|passed|success\"}[24h])))",
+          "refId": "A",
+          "legendFormat": "{{suite}}",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "none",
+          "min": 0,
+          "max": null,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 1
+              },
+              {
+                "color": "orange",
+                "value": 3
+              },
+              {
+                "color": "red",
+                "value": 5
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      },
+      "transformations": [
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "desc"
+          }
+        }
+      ]
+    },
+    {
+      "id": 5,
+      "type": "bargauge",
+      "title": "Platform Test Success Rate by Suite (24h, lowest first)",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 8
+      },
+      "targets": [
+        {
+          "expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
+          "refId": "A",
+          "legendFormat": "{{suite}}",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "min": 0,
+          "max": 100,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "orange",
+                "value": 80
+              },
+              {
+                "color": "yellow",
+                "value": 95
+              },
+              {
+                "color": "green",
+                "value": 99
+              }
+            ]
+          },
+          "decimals": 2
+        },
+        "overrides": []
+      },
+      "options": {
+        "displayMode": "gradient",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      },
+      "transformations": [
+        {
+          "id": "sortBy",
+          "options": {
+            "fields": [
+              "Value"
+            ],
+            "order": "asc"
+          }
+        }
+      ]
+    },
+    {
+      "id": 6,
+      "type": "timeseries",
+      "title": "Platform Test Success Rate by Suite",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 16
+      },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\"}[1h]))) > 0)",
+          "legendFormat": "ariadne"
+        },
+        {
+          "refId": "B",
+          "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"metis\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"metis\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"metis\"}[1h]))) > 0)",
+          "legendFormat": "metis"
+        },
+        {
+          "refId": "C",
+          "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"ananke\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"ananke\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"ananke\"}[1h]))) > 0)",
+          "legendFormat": "ananke"
+        },
+        {
+          "refId": "D",
+          "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[1h]))) > 0)",
+          "legendFormat": "atlasbot"
+        },
+        {
+          "refId": "E",
+          "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[1h]))) > 0)",
+          "legendFormat": "lesavka"
+        },
+        {
+          "refId": "F",
+          "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[1h]))) > 0)",
+          "legendFormat": "pegasus"
+        },
+        {
+          "refId": "G",
+          "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"soteria\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[1h]))) > 0)",
+          "legendFormat": "soteria"
+        },
+        {
+          "refId": "H",
+          "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[1h]))) > 0)",
+          "legendFormat": "titan-iac"
+        },
+        {
+          "refId": "I",
+          "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[1h]))) > 0)",
+          "legendFormat": "bstein-home"
+        },
+        {
+          "refId": "J",
+          "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[1h]))) > 0)",
+          "legendFormat": "arcanagon"
+        },
+        {
+          "refId": "K",
+          "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\"}[1h]))) > 0)",
+          "legendFormat": "data-prepper"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "min": 0,
+          "max": 100,
+          "custom": {
+            "drawStyle": "line",
+            "lineInterpolation": "linear",
+            "lineWidth": 2,
+            "fillOpacity": 10,
+            "showPoints": "always",
+            "pointSize": 4,
+            "spanNulls": true
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "description": "Success-rate trend per suite (rolling 1h window). Windows with no runs produce no point; the line is drawn straight across them."
+    }
+  ],
+  "time": {
+    "from": "now-30d",
+    "to": "now"
+  },
+  "annotations": {
+    "list": []
+  },
+  "schemaVersion": 39,
+  "style": "dark",
+  "tags": [
+    "atlas",
+    "testing",
+    "quality"
+  ]
+}
--- a/services/monitoring/grafana-dashboard-jobs.yaml
+++ b/services/monitoring/grafana-dashboard-jobs.yaml
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@ -1924,8 +1924,8 @@ data:
          },
          "links": [
            {
-              "title": "Open atlas-jobs dashboard",
-              "url": "/d/atlas-jobs",
+              "title": "Open atlas-testing dashboard",
+              "url": "/d/atlas-testing",
              "targetBlank": true
            }
          ],
--- a/services/monitoring/grafana-dashboard-testing.yaml
+++ b/services/monitoring/grafana-dashboard-testing.yaml
@ -0,0 +1,471 @@
+# services/monitoring/grafana-dashboard-testing.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-testing
+  labels:
+    grafana_dashboard: "1"
+data:
+  atlas-testing.json: |
+    {
+      "uid": "atlas-testing",
+      "title": "Atlas Testing",
+      "folderUid": "atlas-internal",
+      "editable": true,
+      "panels": [
+        {
+          "id": 1,
+          "type": "stat",
+          "title": "Platform Test Success Rate (30d)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 6,
+            "x": 0,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "100 * ((sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[30d])) or on() vector(0))), 1)",
+              "refId": "A",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "red",
+                    "value": null
+                  },
+                  {
+                    "color": "orange",
+                    "value": 80
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 95
+                  },
+                  {
+                    "color": "green",
+                    "value": 99
+                  }
+                ]
+              },
+              "unit": "percent",
+              "custom": {
+                "displayMode": "auto"
+              },
+              "decimals": 2
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          },
+          "description": "Overall success rate across tracked suites over the last 30 days."
+        },
+        {
+          "id": 2,
+          "type": "stat",
+          "title": "Platform Test Failures (24h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 4,
+            "w": 6,
+            "x": 6,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "(sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status!~\"ok|passed|success\"}[24h])) or on() vector(0))",
+              "refId": "A",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 1
+                  },
+                  {
+                    "color": "orange",
+                    "value": 3
+                  },
+                  {
+                    "color": "red",
+                    "value": 5
+                  }
+                ]
+              },
+              "unit": "none",
+              "custom": {
+                "displayMode": "auto"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "colorMode": "value",
+            "graphMode": "area",
+            "justifyMode": "center",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "textMode": "value"
+          },
+          "description": "Total failed runs in the last 24 hours."
+        },
+        {
+          "id": 3,
+          "type": "table",
+          "title": "Platform Test Activity (30d)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum by (suite, status) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[30d]))",
+              "refId": "A",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none",
+              "custom": {
+                "filterable": true
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true,
+            "columnFilters": false
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            },
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            }
+          ],
+          "description": "Suite/status event counts over 30 days."
+        },
+        {
+          "id": 4,
+          "type": "bargauge",
+          "title": "Platform Test Failures by Suite (24h)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 8
+          },
+          "targets": [
+            {
+              "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status!~\"ok|passed|success\"}[24h])))",
+              "refId": "A",
+              "legendFormat": "{{suite}}",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "none",
+              "min": 0,
+              "max": null,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 1
+                  },
+                  {
+                    "color": "orange",
+                    "value": 3
+                  },
+                  {
+                    "color": "red",
+                    "value": 5
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "transformations": [
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "desc"
+              }
+            }
+          ]
+        },
+        {
+          "id": 5,
+          "type": "bargauge",
+          "title": "Platform Test Success Rate by Suite (24h, lowest first)",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 8
+          },
+          "targets": [
+            {
+              "expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
+              "refId": "A",
+              "legendFormat": "{{suite}}",
+              "instant": true
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent",
+              "min": 0,
+              "max": 100,
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "red",
+                    "value": null
+                  },
+                  {
+                    "color": "orange",
+                    "value": 80
+                  },
+                  {
+                    "color": "yellow",
+                    "value": 95
+                  },
+                  {
+                    "color": "green",
+                    "value": 99
+                  }
+                ]
+              },
+              "decimals": 2
+            },
+            "overrides": []
+          },
+          "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          },
+          "transformations": [
+            {
+              "id": "sortBy",
+              "options": {
+                "fields": [
+                  "Value"
+                ],
+                "order": "asc"
+              }
+            }
+          ]
+        },
+        {
+          "id": 6,
+          "type": "timeseries",
+          "title": "Platform Test Success Rate by Suite",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 24,
+            "x": 0,
+            "y": 16
+          },
+          "targets": [
+            {
+              "refId": "A",
+              "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\"}[1h]))) > 0)",
+              "legendFormat": "ariadne"
+            },
+            {
+              "refId": "B",
+              "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"metis\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"metis\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"metis\"}[1h]))) > 0)",
+              "legendFormat": "metis"
+            },
+            {
+              "refId": "C",
+              "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"ananke\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"ananke\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"ananke\"}[1h]))) > 0)",
+              "legendFormat": "ananke"
+            },
+            {
+              "refId": "D",
+              "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[1h]))) > 0)",
+              "legendFormat": "atlasbot"
+            },
+            {
+              "refId": "E",
+              "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[1h]))) > 0)",
+              "legendFormat": "lesavka"
+            },
+            {
+              "refId": "F",
+              "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[1h]))) > 0)",
+              "legendFormat": "pegasus"
+            },
+            {
+              "refId": "G",
+              "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"soteria\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[1h]))) > 0)",
+              "legendFormat": "soteria"
+            },
+            {
+              "refId": "H",
+              "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[1h]))) > 0)",
+              "legendFormat": "titan-iac"
+            },
+            {
+              "refId": "I",
+              "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[1h]))) > 0)",
+              "legendFormat": "bstein-home"
+            },
+            {
+              "refId": "J",
+              "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[1h]))) > 0)",
+              "legendFormat": "arcanagon"
+            },
+            {
+              "refId": "K",
+              "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\"}[1h]))) > 0)",
+              "legendFormat": "data-prepper"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent",
+              "min": 0,
+              "max": 100,
+              "custom": {
+                "drawStyle": "line",
+                "lineInterpolation": "linear",
+                "lineWidth": 2,
+                "fillOpacity": 10,
+                "showPoints": "always",
+                "pointSize": 4,
+                "spanNulls": true
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "bottom"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          },
+          "description": "Success-rate trend per suite (rolling 1h window). Windows with no runs produce no point; the line is drawn straight across them."
+        }
+      ],
+      "time": {
+        "from": "now-30d",
+        "to": "now"
+      },
+      "annotations": {
+        "list": []
+      },
+      "schemaVersion": 39,
+      "style": "dark",
+      "tags": [
+        "atlas",
+        "testing",
+        "quality"
+      ]
+    }
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@ -16,6 +16,7 @@ resources:
  - grafana-dashboard-power.yaml
  - grafana-dashboard-mail.yaml
  - grafana-dashboard-jobs.yaml
+  - grafana-dashboard-testing.yaml
  - dcgm-exporter.yaml
  - jetson-tegrastats-exporter.yaml
  - postmark-exporter-service.yaml