From 299a68ad95231dd8fc58c549c051dddc90d58867 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sun, 12 Apr 2026 20:05:39 -0300 Subject: [PATCH] monitoring(jobs): split testing dashboard and clean up job ops view --- scripts/dashboards_render_atlas.py | 603 +++-- .../monitoring/dashboards/atlas-jobs.json | 2098 +++++++++-------- .../monitoring/dashboards/atlas-overview.json | 4 +- .../monitoring/dashboards/atlas-testing.json | 462 ++++ .../monitoring/grafana-dashboard-jobs.yaml | 2098 +++++++++-------- .../grafana-dashboard-overview.yaml | 4 +- .../monitoring/grafana-dashboard-testing.yaml | 471 ++++ services/monitoring/kustomization.yaml | 1 + 8 files changed, 3488 insertions(+), 2253 deletions(-) create mode 100644 services/monitoring/dashboards/atlas-testing.json create mode 100644 services/monitoring/grafana-dashboard-testing.yaml diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 595fcf7c..1b4c8e44 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -428,13 +428,59 @@ ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = ( ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = ( f"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{{{ARIADNE_SCHEDULE_TASK_FILTER}}}[$__range])) / 3600" ) +ARIADNE_SCHEDULE_NEXT_RUN_HOURS = ( + f"((ariadne_schedule_next_run_timestamp_seconds{{{ARIADNE_SCHEDULE_TASK_FILTER}}} - time()) / 3600)" +) ARIADNE_SCHEDULE_LAST_STATUS = f"ariadne_schedule_last_status{{{ARIADNE_SCHEDULE_TASK_FILTER}}}" +ARIADNE_SCHEDULE_SIGNAL_COUNT = ( + f"count(ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_SCHEDULE_TASK_FILTER}}}) or on() vector(0)" +) +ARIADNE_SCHEDULE_STALE_WINDOW_SEC = 36 * 3600 +ARIADNE_SCHEDULE_STALE_COUNT = ( + f"sum(((time() - ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_SCHEDULE_TASK_FILTER}}}) > bool {ARIADNE_SCHEDULE_STALE_WINDOW_SEC}))" + " or on() vector(0)" +) +ARIADNE_SCHEDULE_MISSING_SUCCESS_COUNT = ( + f"count((ariadne_schedule_next_run_timestamp_seconds{{{ARIADNE_SCHEDULE_TASK_FILTER}}} unless on(task) " + f"ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_SCHEDULE_TASK_FILTER}}})) or on() vector(0)" +) +ARIADNE_SCHEDULE_FAILED_LAST_COUNT = ( + f"sum(((1 - ariadne_schedule_last_status{{{ARIADNE_SCHEDULE_TASK_FILTER}}}) > bool 0)) or on() vector(0)" +) ARIADNE_SCHEDULE_RUNS_RANGE = ( f'sum by (task) (increase(ariadne_task_runs_total{{{ARIADNE_SCHEDULE_TASK_FILTER}}}[$__range]))' ) ARIADNE_SCHEDULE_ERRORS_RANGE = ( f'sum by (task) (increase(ariadne_task_runs_total{{status="error",{ARIADNE_SCHEDULE_TASK_FILTER}}}[$__range]))' ) +JENKINS_CLEANUP_SIGNAL_COUNT = ( + "count(ariadne_jenkins_workspace_cleanup_last_run_timestamp_seconds) or on() vector(0)" +) +JENKINS_CLEANUP_RUNS_RANGE = ( + "sum by (mode, status) (increase(ariadne_jenkins_workspace_cleanup_runs_total[$__range]))" +) +JENKINS_CLEANUP_OBJECTS_RANGE = ( + "sum by (kind, action, mode) (increase(ariadne_jenkins_workspace_cleanup_objects_total[$__range]))" +) +JENKINS_CLEANUP_LAST_RUN_AGE_HOURS = ( + "((time() - ariadne_jenkins_workspace_cleanup_last_run_timestamp_seconds) / 3600) or on() vector(999)" +) +JENKINS_CLEANUP_LAST_SUCCESS_AGE_HOURS = ( + "((time() - ariadne_jenkins_workspace_cleanup_last_success_timestamp_seconds) / 3600) or on() vector(999)" +) +JENKINS_CLEANUP_LAST_DELETED = "ariadne_jenkins_workspace_cleanup_last_deleted_total or on() vector(0)" +JENKINS_CLEANUP_LAST_PLANNED = "ariadne_jenkins_workspace_cleanup_last_planned_total or on() vector(0)" +JENKINS_WORKSPACE_PV_STALE_COUNT = ( + 'sum((kube_persistentvolume_status_phase{phase=~"Released|Failed"} > bool 0) ' + '* on(persistentvolume) group_left(claim_namespace,name) ' + 'kube_persistentvolume_claim_ref{claim_namespace="jenkins",name=~"pvc-workspace-.*"}) or on() vector(0)' +) +JENKINS_WORKSPACE_PV_STALE_AGE_HOURS = ( + '((time() - kube_persistentvolume_created) / 3600) ' + '* on(persistentvolume) group_left(claim_namespace,name) ' + 'kube_persistentvolume_claim_ref{claim_namespace="jenkins",name=~"pvc-workspace-.*"} ' + '* on(persistentvolume) group_left() (kube_persistentvolume_status_phase{phase=~"Released|Failed"} > bool 0)' +) ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" PLATFORM_TEST_SUITE_NAMES = [ "ariadne", @@ -1603,7 +1649,7 @@ def build_overview(): legend_display="table", legend_placement="right", legend_calcs=["lastNotNull"], - links=link_to("atlas-jobs"), + links=link_to("atlas-testing"), ) test_success["fieldConfig"]["defaults"]["min"] = 0 test_success["fieldConfig"]["defaults"]["max"] = 100 @@ -2903,6 +2949,15 @@ def build_jobs_dashboard(): {"color": "red", "value": 48}, ], } + old_age_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 24}, + {"color": "orange", "value": 72}, + {"color": "red", "value": 168}, + ], + } recent_error_thresholds = { "mode": "absolute", "steps": [ @@ -2912,7 +2967,6 @@ def build_jobs_dashboard(): {"color": "green", "value": 24}, ], } - task_error_thresholds = { "mode": "absolute", "steps": [ @@ -2922,6 +2976,15 @@ def build_jobs_dashboard(): {"color": "red", "value": 5}, ], } + count_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 2}, + {"color": "red", "value": 3}, + ], + } schedule_status_thresholds = { "mode": "absolute", "steps": [ @@ -2932,24 +2995,162 @@ def build_jobs_dashboard(): } panels.append( - bargauge_panel( + stat_panel( 1, - "Ariadne Task Errors (range)", - ARIADNE_TASK_ERRORS_RANGE, - {"h": 7, "w": 8, "x": 0, "y": 0}, + "Schedule Metrics Exported", + ARIADNE_SCHEDULE_SIGNAL_COUNT, + {"h": 4, "w": 4, "x": 0, "y": 0}, unit="none", instant=True, - legend="{{task}}", - thresholds=task_error_thresholds, + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "green", "value": 1}, + ], + }, ) ) panels.append( + stat_panel( + 2, + "Schedule Tasks Stale (>36h)", + ARIADNE_SCHEDULE_STALE_COUNT, + {"h": 4, "w": 4, "x": 4, "y": 0}, + unit="none", + thresholds=count_thresholds, + ) + ) + panels.append( + stat_panel( + 3, + "Schedule Tasks Missing Success", + ARIADNE_SCHEDULE_MISSING_SUCCESS_COUNT, + {"h": 4, "w": 4, "x": 8, "y": 0}, + unit="none", + thresholds=count_thresholds, + ) + ) + panels.append( + stat_panel( + 4, + "Schedule Tasks Failed Last Run", + ARIADNE_SCHEDULE_FAILED_LAST_COUNT, + {"h": 4, "w": 4, "x": 12, "y": 0}, + unit="none", + thresholds=count_thresholds, + ) + ) + panels.append( + stat_panel( + 5, + "Glue Jobs Stale (>36h)", + GLUE_STALE_COUNT, + {"h": 4, "w": 4, "x": 16, "y": 0}, + unit="none", + thresholds=count_thresholds, + ) + ) + panels.append( + stat_panel( + 6, + "Jenkins Workspace PV Backlog", + JENKINS_WORKSPACE_PV_STALE_COUNT, + {"h": 4, "w": 4, "x": 20, "y": 0}, + unit="none", + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 10}, + {"color": "red", "value": 25}, + ], + }, + ) + ) + schedule_list_panel = table_panel( + 7, + "Ariadne Schedules: Last Success (h, newest first)", + ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS, + {"h": 8, "w": 12, "x": 0, "y": 4}, + unit="h", + transformations=[ + {"id": "labelsToFields", "options": {}}, + {"id": "sortBy", "options": {"fields": ["Value"], "order": "asc"}}, + ], + instant=True, + ) + schedule_list_panel["description"] = "Primary schedule inventory ordered by recency so fresh jobs stay at the top." + panels.append(schedule_list_panel) + panels.append( + bargauge_panel( + 8, + "Ariadne Schedule Last Error (hours ago)", + ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS, + {"h": 8, "w": 12, "x": 12, "y": 4}, + unit="h", + instant=True, + legend="{{task}}", + sort_order="asc", + thresholds=recent_error_thresholds, + decimals=2, + ) + ) + status_panel = bargauge_panel( + 9, + "Ariadne Schedule Last Status", + ARIADNE_SCHEDULE_LAST_STATUS, + {"h": 8, "w": 8, "x": 0, "y": 12}, + unit="none", + instant=True, + legend="{{task}}", + sort_order="asc", + thresholds=schedule_status_thresholds, + decimals=0, + ) + status_panel["description"] = "1 means the last run was ok. 0 means the last run ended in error." + status_panel["fieldConfig"]["defaults"]["mappings"] = [ { - "id": 2, + "type": "value", + "options": { + "0": {"text": "error"}, + "1": {"text": "ok"}, + }, + } + ] + panels.append(status_panel) + schedule_runs_panel = bargauge_panel( + 10, + "Ariadne Schedule Runs (range)", + ARIADNE_SCHEDULE_RUNS_RANGE, + {"h": 8, "w": 8, "x": 8, "y": 12}, + unit="none", + instant=True, + legend="{{task}}", + thresholds={"mode": "absolute", "steps": [{"color": "green", "value": None}]}, + ) + schedule_runs_panel["description"] = "Number of runs by schedule task over the selected dashboard time range." + panels.append(schedule_runs_panel) + schedule_errors_panel = bargauge_panel( + 11, + "Ariadne Schedule Errors (range)", + ARIADNE_SCHEDULE_ERRORS_RANGE, + {"h": 8, "w": 8, "x": 16, "y": 12}, + unit="none", + instant=True, + legend="{{task}}", + thresholds=task_error_thresholds, + ) + schedule_errors_panel["description"] = "Error run count by schedule task over the selected dashboard time range." + panels.append(schedule_errors_panel) + panels.append( + { + "id": 12, "type": "timeseries", "title": "Ariadne Attempts / Failures", "datasource": PROM_DS, - "gridPos": {"h": 7, "w": 8, "x": 8, "y": 0}, + "gridPos": {"h": 7, "w": 12, "x": 0, "y": 20}, "targets": [ {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"}, @@ -2979,10 +3180,10 @@ def build_jobs_dashboard(): ) panels.append( bargauge_panel( - 3, + 13, "One-off Job Pods (age hours)", ONEOFF_JOB_POD_AGE_HOURS, - {"h": 7, "w": 8, "x": 16, "y": 0}, + {"h": 7, "w": 12, "x": 12, "y": 20}, unit="h", instant=True, legend="{{namespace}}/{{pod}}", @@ -2991,186 +3192,251 @@ def build_jobs_dashboard(): decimals=2, ) ) + panels.append( + bargauge_panel( + 14, + "Glue Jobs Last Success (hours ago)", + GLUE_LAST_SUCCESS_RANGE_HOURS, + {"h": 6, "w": 12, "x": 0, "y": 27}, + unit="h", + instant=True, + legend="{{namespace}}/{{cronjob}}", + thresholds=age_thresholds, + decimals=2, + ) + ) + panels.append( + bargauge_panel( + 15, + "Glue Jobs Last Schedule (hours ago)", + GLUE_LAST_SCHEDULE_RANGE_HOURS, + {"h": 6, "w": 12, "x": 12, "y": 27}, + unit="h", + instant=True, + legend="{{namespace}}/{{cronjob}}", + thresholds=age_thresholds, + decimals=2, + ) + ) panels.append( stat_panel( - 4, - "Glue Jobs Stale (>36h)", - GLUE_STALE_COUNT, - {"h": 4, "w": 4, "x": 0, "y": 7}, + 16, + "Jenkins Cleanup Signal Present", + JENKINS_CLEANUP_SIGNAL_COUNT, + {"h": 4, "w": 4, "x": 0, "y": 33}, unit="none", + instant=True, thresholds={ "mode": "absolute", "steps": [ - {"color": "green", "value": None}, - {"color": "yellow", "value": 1}, - {"color": "orange", "value": 2}, - {"color": "red", "value": 3}, + {"color": "red", "value": None}, + {"color": "green", "value": 1}, ], }, ) ) panels.append( stat_panel( - 5, - "Glue Jobs Missing Success", - GLUE_MISSING_COUNT, - {"h": 4, "w": 4, "x": 4, "y": 7}, - unit="none", - ) - ) - panels.append( - stat_panel( - 6, - "Glue Jobs Suspended", - GLUE_SUSPENDED_COUNT, - {"h": 4, "w": 4, "x": 8, "y": 7}, - unit="none", - ) - ) - panels.append( - stat_panel( - 7, - "Ariadne Task Errors (1h)", - ARIADNE_TASK_ERRORS_1H_TOTAL, - {"h": 4, "w": 4, "x": 12, "y": 7}, - unit="none", - ) - ) - panels.append( - stat_panel( - 8, - "Ariadne Task Errors (24h)", - ARIADNE_TASK_ERRORS_24H_TOTAL, - {"h": 4, "w": 4, "x": 16, "y": 7}, - unit="none", - ) - ) - panels.append( - stat_panel( - 9, - "Ariadne Task Runs (1h)", - ARIADNE_TASK_RUNS_1H_TOTAL, - {"h": 4, "w": 4, "x": 20, "y": 7}, - unit="none", - ) - ) - panels.append( - bargauge_panel( - 10, - "Ariadne Schedule Last Error (hours ago)", - ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS, - {"h": 6, "w": 12, "x": 0, "y": 17}, + 17, + "Jenkins Cleanup Last Run Age (h)", + JENKINS_CLEANUP_LAST_RUN_AGE_HOURS, + {"h": 4, "w": 4, "x": 4, "y": 33}, unit="h", - instant=True, - legend="{{task}}", - thresholds=recent_error_thresholds, decimals=2, - ) - ) - panels.append( - bargauge_panel( - 11, - "Ariadne Schedule Last Success (hours ago, newest first)", - ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS, - {"h": 6, "w": 12, "x": 12, "y": 17}, - unit="h", instant=True, - legend="{{task}}", - sort_order="asc", thresholds=age_thresholds, - decimals=2, ) ) panels.append( - bargauge_panel( - 12, - "Glue Jobs Last Success (hours ago)", - GLUE_LAST_SUCCESS_RANGE_HOURS, - {"h": 6, "w": 12, "x": 0, "y": 23}, + stat_panel( + 18, + "Jenkins Cleanup Last Success Age (h)", + JENKINS_CLEANUP_LAST_SUCCESS_AGE_HOURS, + {"h": 4, "w": 4, "x": 8, "y": 33}, unit="h", - instant=True, - legend="{{namespace}}/{{cronjob}}", - thresholds=age_thresholds, decimals=2, + instant=True, + thresholds=age_thresholds, ) ) panels.append( - bargauge_panel( - 13, - "Glue Jobs Last Schedule (hours ago)", - GLUE_LAST_SCHEDULE_RANGE_HOURS, - {"h": 6, "w": 12, "x": 12, "y": 23}, - unit="h", - instant=True, - legend="{{namespace}}/{{cronjob}}", - thresholds=age_thresholds, - decimals=2, - ) - ) - panels.append( - bargauge_panel( - 14, - "Ariadne Task Errors (1h)", - ARIADNE_TASK_ERRORS_1H, - {"h": 6, "w": 12, "x": 0, "y": 29}, + stat_panel( + 19, + "Jenkins Cleanup Planned (last run)", + JENKINS_CLEANUP_LAST_PLANNED, + {"h": 4, "w": 4, "x": 12, "y": 33}, unit="none", instant=True, - legend="{{task}}", - thresholds=task_error_thresholds, ) ) panels.append( - bargauge_panel( - 15, - "Ariadne Task Errors (30d)", - ARIADNE_TASK_ERRORS_30D, - {"h": 6, "w": 12, "x": 12, "y": 29}, + stat_panel( + 20, + "Jenkins Cleanup Deleted (last run)", + JENKINS_CLEANUP_LAST_DELETED, + {"h": 4, "w": 4, "x": 16, "y": 33}, unit="none", instant=True, - legend="{{task}}", - thresholds=task_error_thresholds, ) ) panels.append( - bargauge_panel( - 16, + stat_panel( + 21, "Ariadne Access Requests", ARIADNE_ACCESS_REQUESTS, - {"h": 6, "w": 8, "x": 0, "y": 11}, + {"h": 4, "w": 4, "x": 20, "y": 33}, unit="none", instant=True, - legend="{{status}}", ) ) - coverage_panel = stat_panel( - 17, + panels.append( + timeseries_panel( + 22, + "Jenkins Cleanup Runs (range)", + None, + {"h": 7, "w": 12, "x": 0, "y": 37}, + unit="none", + targets=[ + {"refId": "A", "expr": JENKINS_CLEANUP_RUNS_RANGE, "legendFormat": "{{mode}}/{{status}}"}, + ], + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 23, + "Jenkins Cleanup Objects (range)", + None, + {"h": 7, "w": 12, "x": 12, "y": 37}, + unit="none", + targets=[ + {"refId": "A", "expr": JENKINS_CLEANUP_OBJECTS_RANGE, "legendFormat": "{{kind}}/{{action}}/{{mode}}"}, + ], + legend_display="table", + legend_placement="right", + ) + ) + stale_volume_panel = bargauge_panel( + 24, + "Jenkins Workspace PV Age (h, detached only)", + JENKINS_WORKSPACE_PV_STALE_AGE_HOURS, + {"h": 10, "w": 24, "x": 0, "y": 44}, + unit="h", + instant=True, + legend="{{name}} -> {{persistentvolume}}", + thresholds=old_age_thresholds, + decimals=1, + limit=40, + ) + stale_volume_panel["description"] = ( + "Oldest detached Jenkins workspace volumes first. This is the direct cleanup backlog view." + ) + panels.append(stale_volume_panel) + + return { + "uid": "atlas-jobs", + "title": "Atlas Jobs", + "folderUid": PRIVATE_FOLDER, + "editable": True, + "panels": panels, + "time": {"from": "now-7d", "to": "now"}, + "annotations": {"list": []}, + "schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "jobs", "glue"], + } + + +def build_testing_dashboard(): + panels = [] + pass_rate_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 80}, + {"color": "yellow", "value": 95}, + {"color": "green", "value": 99}, + ], + } + failures_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 3}, + {"color": "red", "value": 5}, + ], + } + + pass_rate_panel = stat_panel( + 1, "Platform Test Success Rate (30d)", TEST_SUCCESS_RATE, - {"h": 6, "w": 4, "x": 8, "y": 11}, + {"h": 4, "w": 6, "x": 0, "y": 0}, unit="percent", decimals=2, instant=True, + thresholds=pass_rate_thresholds, ) - coverage_panel["description"] = "Internal rollup across Ariadne task runs and Metis build/flash outcomes." - panels.append(coverage_panel) - tests_panel = table_panel( - 18, + pass_rate_panel["description"] = "Overall success rate across tracked suites over the last 30 days." + panels.append(pass_rate_panel) + failures_panel = stat_panel( + 2, + "Platform Test Failures (24h)", + TEST_FAILURES_24H_TOTAL, + {"h": 4, "w": 6, "x": 6, "y": 0}, + unit="none", + instant=True, + thresholds=failures_thresholds, + ) + failures_panel["description"] = "Total failed runs in the last 24 hours." + panels.append(failures_panel) + activity_panel = table_panel( + 3, "Platform Test Activity (30d)", PLATFORM_TEST_ACTIVITY_30D, - {"h": 6, "w": 12, "x": 12, "y": 11}, + {"h": 8, "w": 12, "x": 12, "y": 0}, unit="none", - transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], + transformations=[ + {"id": "labelsToFields", "options": {}}, + {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}, + ], instant=True, ) - tests_panel["description"] = ( - "Atlas Overview test panels depend on this internal activity table sourced from Ariadne and Metis counters." + activity_panel["description"] = "Suite/status event counts over 30 days." + panels.append(activity_panel) + panels.append( + bargauge_panel( + 4, + "Platform Test Failures by Suite (24h)", + PLATFORM_TEST_FAILURES_24H_BY_SUITE, + {"h": 8, "w": 12, "x": 0, "y": 8}, + unit="none", + instant=True, + legend="{{suite}}", + thresholds=failures_thresholds, + ) + ) + panels.append( + bargauge_panel( + 5, + "Platform Test Success Rate by Suite (24h, lowest first)", + PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE, + {"h": 8, "w": 12, "x": 12, "y": 8}, + unit="percent", + instant=True, + legend="{{suite}}", + sort_order="asc", + thresholds=pass_rate_thresholds, + decimals=2, + ) ) - panels.append(tests_panel) suite_panel = timeseries_panel( - 19, + 6, "Platform Test Success Rate by Suite", None, - {"h": 6, "w": 16, "x": 8, "y": 17}, + {"h": 8, "w": 24, "x": 0, "y": 16}, unit="percent", targets=PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS, legend_display="list", @@ -3187,69 +3453,20 @@ def build_jobs_dashboard(): "pointSize": 4, "spanNulls": True, } - suite_panel["description"] = ( - "Per-run interval pass points (0-100) per suite. Existing suites: ariadne, metis, ananke; additional suites appear automatically when platform_quality_gate_runs_total is published." - ) + suite_panel["description"] = "Trend line per suite. Flat gaps mean no runs in that interval." panels.append(suite_panel) - status_panel = bargauge_panel( - 20, - "Ariadne Schedule Last Status", - ARIADNE_SCHEDULE_LAST_STATUS, - {"h": 8, "w": 8, "x": 0, "y": 35}, - unit="none", - instant=True, - legend="{{task}}", - sort_order="asc", - thresholds=schedule_status_thresholds, - decimals=0, - ) - status_panel["description"] = "1 means the last run was ok. 0 means the last run ended in error." - status_panel["fieldConfig"]["defaults"]["mappings"] = [ - { - "type": "value", - "options": { - "0": {"text": "error"}, - "1": {"text": "ok"}, - }, - } - ] - panels.append(status_panel) - schedule_runs_panel = bargauge_panel( - 21, - "Ariadne Schedule Runs (range)", - ARIADNE_SCHEDULE_RUNS_RANGE, - {"h": 8, "w": 8, "x": 8, "y": 35}, - unit="none", - instant=True, - legend="{{task}}", - thresholds={"mode": "absolute", "steps": [{"color": "green", "value": None}]}, - ) - schedule_runs_panel["description"] = "Number of runs by schedule task over the selected dashboard time range." - panels.append(schedule_runs_panel) - schedule_errors_panel = bargauge_panel( - 22, - "Ariadne Schedule Errors (range)", - ARIADNE_SCHEDULE_ERRORS_RANGE, - {"h": 8, "w": 8, "x": 16, "y": 35}, - unit="none", - instant=True, - legend="{{task}}", - thresholds=task_error_thresholds, - ) - schedule_errors_panel["description"] = "Error run count by schedule task over the selected dashboard time range." - panels.append(schedule_errors_panel) return { - "uid": "atlas-jobs", - "title": "Atlas Jobs", + "uid": "atlas-testing", + "title": "Atlas Testing", "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, - "time": {"from": "now-7d", "to": "now"}, + "time": {"from": "now-30d", "to": "now"}, "annotations": {"list": []}, "schemaVersion": 39, "style": "dark", - "tags": ["atlas", "jobs", "glue"], + "tags": ["atlas", "testing", "quality"], } @@ -3529,6 +3746,10 @@ DASHBOARDS = { "builder": build_jobs_dashboard, "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml", }, + "atlas-testing": { + "builder": build_testing_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-testing.yaml", + }, "atlas-power": { "builder": build_power_dashboard, "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-power.yaml", diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index 6db70077..79ed97e4 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -6,31 +6,91 @@ "panels": [ { "id": 1, - "type": "bargauge", - "title": "Ariadne Task Errors (range)", + "type": "stat", + "title": "Schedule Metrics Exported", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 7, - "w": 8, + "h": 4, + "w": 4, "x": 0, "y": 0 }, "targets": [ { - "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))", + "expr": "count(ariadne_schedule_last_success_timestamp_seconds{task=~\"^schedule\\..+$\"}) or on() vector(0)", "refId": "A", - "legendFormat": "{{task}}", "instant": true } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, "unit": "none", - "min": 0, - "max": null, + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Schedule Tasks Stale (>36h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 0 + }, + "targets": [ + { + "expr": "sum(((time() - ariadne_schedule_last_success_timestamp_seconds{task=~\"^schedule\\..+$\"}) > bool 129600)) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ @@ -44,140 +104,61 @@ }, { "color": "orange", - "value": 3 + "value": 2 }, { "color": "red", - "value": 5 + "value": 3 } ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" } }, "overrides": [] }, "options": { - "displayMode": "gradient", - "orientation": "horizontal", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + }, + "textMode": "value" + } }, { - "id": 2, - "type": "timeseries", - "title": "Ariadne Attempts / Failures", + "id": 3, + "type": "stat", + "title": "Schedule Tasks Missing Success", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 7, - "w": 8, + "h": 4, + "w": 4, "x": 8, "y": 0 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", - "refId": "A", - "legendFormat": "Attempts" - }, - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "B", - "legendFormat": "Failures" + "expr": "count((ariadne_schedule_next_run_timestamp_seconds{task=~\"^schedule\\..+$\"} unless on(task) ariadne_schedule_last_success_timestamp_seconds{task=~\"^schedule\\..+$\"})) or on() vector(0)", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "none" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Attempts" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "green" - } - } - ] + "color": { + "mode": "thresholds" }, - { - "matcher": { - "id": "byName", - "options": "Failures" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "red" - } - } - ] - } - ] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 3, - "type": "bargauge", - "title": "One-off Job Pods (age hours)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 16, - "y": 0 - }, - "targets": [ - { - "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", - "refId": "A", - "legendFormat": "{{namespace}}/{{pod}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "min": 0, - "max": null, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ @@ -187,54 +168,110 @@ }, { "color": "yellow", - "value": 6 + "value": 1 }, { "color": "orange", - "value": 24 + "value": 2 }, { "color": "red", - "value": 48 + "value": 3 } ] }, - "decimals": 2 + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, "options": { - "displayMode": "gradient", - "orientation": "horizontal", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } }, - { - "id": "limit", - "options": { - "limit": 12 - } - } - ] + "textMode": "value" + } }, { "id": 4, "type": "stat", + "title": "Schedule Tasks Failed Last Run", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "sum(((1 - ariadne_schedule_last_status{task=~\"^schedule\\..+$\"}) > bool 0)) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", "title": "Glue Jobs Stale (>36h)", "datasource": { "type": "prometheus", @@ -243,8 +280,8 @@ "gridPos": { "h": 4, "w": 4, - "x": 0, - "y": 7 + "x": 16, + "y": 0 }, "targets": [ { @@ -300,70 +337,10 @@ "textMode": "value" } }, - { - "id": 5, - "type": "stat", - "title": "Glue Jobs Missing Success", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 7 - }, - "targets": [ - { - "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, { "id": 6, "type": "stat", - "title": "Glue Jobs Suspended", + "title": "Jenkins Workspace PV Backlog", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -371,12 +348,12 @@ "gridPos": { "h": 4, "w": 4, - "x": 8, - "y": 7 + "x": 20, + "y": 0 }, "targets": [ { - "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)", + "expr": "sum((kube_persistentvolume_status_phase{phase=~\"Released|Failed\"} > bool 0) * on(persistentvolume) group_left(claim_namespace,name) kube_persistentvolume_claim_ref{claim_namespace=\"jenkins\",name=~\"pvc-workspace-.*\"}) or on() vector(0)", "refId": "A" } ], @@ -390,12 +367,20 @@ "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", + "color": "yellow", "value": 1 + }, + { + "color": "orange", + "value": 10 + }, + { + "color": "red", + "value": 25 } ] }, @@ -422,186 +407,57 @@ }, { "id": 7, - "type": "stat", - "title": "Ariadne Task Errors (1h)", + "type": "table", + "title": "Ariadne Schedules: Last Success (h, newest first)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 4, - "w": 4, - "x": 12, - "y": 7 + "h": 8, + "w": 12, + "x": 0, + "y": 4 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", - "refId": "A" + "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600", + "refId": "A", + "instant": true } ], "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", + "unit": "h", "custom": { - "displayMode": "auto" + "filterable": true } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} }, - "textMode": "value" - } + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "asc" + } + } + ], + "description": "Primary schedule inventory ordered by recency so fresh jobs stay at the top." }, { "id": 8, - "type": "stat", - "title": "Ariadne Task Errors (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 16, - "y": 7 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 9, - "type": "stat", - "title": "Ariadne Task Runs (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 20, - "y": 7 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 10, "type": "bargauge", "title": "Ariadne Schedule Last Error (hours ago)", "datasource": { @@ -609,14 +465,14 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 8, "w": 12, - "x": 0, - "y": 17 + "x": 12, + "y": 4 }, "targets": [ { - "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600)", + "expr": "sort((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -663,81 +519,6 @@ "values": false } }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 11, - "type": "bargauge", - "title": "Ariadne Schedule Last Success (hours ago, newest first)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 17 - }, - "targets": [ - { - "expr": "sort((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600)", - "refId": "A", - "legendFormat": "{{task}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 6 - }, - { - "color": "orange", - "value": 24 - }, - { - "color": "red", - "value": 48 - } - ] - }, - "decimals": 2 - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, "transformations": [ { "id": "sortBy", @@ -751,592 +532,7 @@ ] }, { - "id": 12, - "type": "bargauge", - "title": "Glue Jobs Last Success (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 0, - "y": 23 - }, - "targets": [ - { - "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", - "refId": "A", - "legendFormat": "{{namespace}}/{{cronjob}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 6 - }, - { - "color": "orange", - "value": 24 - }, - { - "color": "red", - "value": 48 - } - ] - }, - "decimals": 2 - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 13, - "type": "bargauge", - "title": "Glue Jobs Last Schedule (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 23 - }, - "targets": [ - { - "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", - "refId": "A", - "legendFormat": "{{namespace}}/{{cronjob}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 6 - }, - { - "color": "orange", - "value": 24 - }, - { - "color": "red", - "value": 48 - } - ] - }, - "decimals": 2 - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 14, - "type": "bargauge", - "title": "Ariadne Task Errors (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 0, - "y": 29 - }, - "targets": [ - { - "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))", - "refId": "A", - "legendFormat": "{{task}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 3 - }, - { - "color": "red", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 15, - "type": "bargauge", - "title": "Ariadne Task Errors (30d)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 29 - }, - "targets": [ - { - "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))", - "refId": "A", - "legendFormat": "{{task}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 3 - }, - { - "color": "red", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 16, - "type": "bargauge", - "title": "Ariadne Access Requests", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 11 - }, - "targets": [ - { - "expr": "sort_desc(ariadne_access_requests_total)", - "refId": "A", - "legendFormat": "{{status}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 70 - }, - { - "color": "red", - "value": 85 - } - ] - } - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 17, - "type": "stat", - "title": "Platform Test Success Rate (30d)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 4, - "x": 8, - "y": 11 - }, - "targets": [ - { - "expr": "100 * ((sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[30d])) or on() vector(0))), 1)", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "percent", - "custom": { - "displayMode": "auto" - }, - "decimals": 2 - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "description": "Internal rollup across Ariadne task runs and Metis build/flash outcomes." - }, - { - "id": 18, - "type": "table", - "title": "Platform Test Activity (30d)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 11 - }, - "targets": [ - { - "expr": "sum by (suite, status) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[30d]))", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ], - "description": "Atlas Overview test panels depend on this internal activity table sourced from Ariadne and Metis counters." - }, - { - "id": 19, - "type": "timeseries", - "title": "Platform Test Success Rate by Suite", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 16, - "x": 8, - "y": 17 - }, - "targets": [ - { - "refId": "A", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\"}[1h]))) > 0)", - "legendFormat": "ariadne" - }, - { - "refId": "B", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"metis\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"metis\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"metis\"}[1h]))) > 0)", - "legendFormat": "metis" - }, - { - "refId": "C", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"ananke\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"ananke\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"ananke\"}[1h]))) > 0)", - "legendFormat": "ananke" - }, - { - "refId": "D", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[1h]))) > 0)", - "legendFormat": "atlasbot" - }, - { - "refId": "E", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[1h]))) > 0)", - "legendFormat": "lesavka" - }, - { - "refId": "F", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[1h]))) > 0)", - "legendFormat": "pegasus" - }, - { - "refId": "G", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"soteria\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[1h]))) > 0)", - "legendFormat": "soteria" - }, - { - "refId": "H", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[1h]))) > 0)", - "legendFormat": "titan-iac" - }, - { - "refId": "I", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[1h]))) > 0)", - "legendFormat": "bstein-home" - }, - { - "refId": "J", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[1h]))) > 0)", - "legendFormat": "arcanagon" - }, - { - "refId": "K", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\"}[1h]))) > 0)", - "legendFormat": "data-prepper" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "custom": { - "drawStyle": "line", - "lineInterpolation": "linear", - "lineWidth": 2, - "fillOpacity": 10, - "showPoints": "always", - "pointSize": 4, - "spanNulls": true - } - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "description": "Per-run interval pass points (0-100) per suite. Existing suites: ariadne, metis, ananke; additional suites appear automatically when platform_quality_gate_runs_total is published." - }, - { - "id": 20, + "id": 9, "type": "bargauge", "title": "Ariadne Schedule Last Status", "datasource": { @@ -1347,7 +543,7 @@ "h": 8, "w": 8, "x": 0, - "y": 35 + "y": 12 }, "targets": [ { @@ -1421,7 +617,7 @@ "description": "1 means the last run was ok. 0 means the last run ended in error." }, { - "id": 21, + "id": 10, "type": "bargauge", "title": "Ariadne Schedule Runs (range)", "datasource": { @@ -1432,7 +628,7 @@ "h": 8, "w": 8, "x": 8, - "y": 35 + "y": 12 }, "targets": [ { @@ -1484,7 +680,7 @@ "description": "Number of runs by schedule task over the selected dashboard time range." }, { - "id": 22, + "id": 11, "type": "bargauge", "title": "Ariadne Schedule Errors (range)", "datasource": { @@ -1495,7 +691,7 @@ "h": 8, "w": 8, "x": 16, - "y": 35 + "y": 12 }, "targets": [ { @@ -1557,6 +753,850 @@ } ], "description": "Error run count by schedule task over the selected dashboard time range." + }, + { + "id": 12, + "type": "timeseries", + "title": "Ariadne Attempts / Failures", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 20 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Attempts" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "green" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 13, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 20 + }, + "targets": [ + { + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 12 + } + } + ] + }, + { + "id": 14, + "type": "bargauge", + "title": "Glue Jobs Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 27 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, + "type": "bargauge", + "title": "Glue Jobs Last Schedule (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 27 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 16, + "type": "stat", + "title": "Jenkins Cleanup Signal Present", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 33 + }, + "targets": [ + { + "expr": "count(ariadne_jenkins_workspace_cleanup_last_run_timestamp_seconds) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 17, + "type": "stat", + "title": "Jenkins Cleanup Last Run Age (h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 33 + }, + "targets": [ + { + "expr": "((time() - ariadne_jenkins_workspace_cleanup_last_run_timestamp_seconds) / 3600) or on() vector(999)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "unit": "h", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 18, + "type": "stat", + "title": "Jenkins Cleanup Last Success Age (h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 33 + }, + "targets": [ + { + "expr": "((time() - ariadne_jenkins_workspace_cleanup_last_success_timestamp_seconds) / 3600) or on() vector(999)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "unit": "h", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 19, + "type": "stat", + "title": "Jenkins Cleanup Planned (last run)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 33 + }, + "targets": [ + { + "expr": "ariadne_jenkins_workspace_cleanup_last_planned_total or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 20, + "type": "stat", + "title": "Jenkins Cleanup Deleted (last run)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 33 + }, + "targets": [ + { + "expr": "ariadne_jenkins_workspace_cleanup_last_deleted_total or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 21, + "type": "stat", + "title": "Ariadne Access Requests", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 33 + }, + "targets": [ + { + "expr": "ariadne_access_requests_total", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 22, + "type": "timeseries", + "title": "Jenkins Cleanup Runs (range)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 37 + }, + "targets": [ + { + "refId": "A", + "expr": "sum by (mode, status) (increase(ariadne_jenkins_workspace_cleanup_runs_total[$__range]))", + "legendFormat": "{{mode}}/{{status}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 23, + "type": "timeseries", + "title": "Jenkins Cleanup Objects (range)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 37 + }, + "targets": [ + { + "refId": "A", + "expr": "sum by (kind, action, mode) (increase(ariadne_jenkins_workspace_cleanup_objects_total[$__range]))", + "legendFormat": "{{kind}}/{{action}}/{{mode}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 24, + "type": "bargauge", + "title": "Jenkins Workspace PV Age (h, detached only)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 44 + }, + "targets": [ + { + "expr": "sort_desc(((time() - kube_persistentvolume_created) / 3600) * on(persistentvolume) group_left(claim_namespace,name) kube_persistentvolume_claim_ref{claim_namespace=\"jenkins\",name=~\"pvc-workspace-.*\"} * on(persistentvolume) group_left() (kube_persistentvolume_status_phase{phase=~\"Released|Failed\"} > bool 0))", + "refId": "A", + "legendFormat": "{{name}} -> {{persistentvolume}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 24 + }, + { + "color": "orange", + "value": 72 + }, + { + "color": "red", + "value": 168 + } + ] + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 40 + } + } + ], + "description": "Oldest detached Jenkins workspace volumes first. This is the direct cleanup backlog view." } ], "time": { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 396ab361..6fa5e8a3 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1915,8 +1915,8 @@ }, "links": [ { - "title": "Open atlas-jobs dashboard", - "url": "/d/atlas-jobs", + "title": "Open atlas-testing dashboard", + "url": "/d/atlas-testing", "targetBlank": true } ], diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json new file mode 100644 index 00000000..1fc0f569 --- /dev/null +++ b/services/monitoring/dashboards/atlas-testing.json @@ -0,0 +1,462 @@ +{ + "uid": "atlas-testing", + "title": "Atlas Testing", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Platform Test Success Rate (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "100 * ((sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[30d])) or on() vector(0))), 1)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "yellow", + "value": 95 + }, + { + "color": "green", + "value": 99 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "description": "Overall success rate across tracked suites over the last 30 days." + }, + { + "id": 2, + "type": "stat", + "title": "Platform Test Failures (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "(sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status!~\"ok|passed|success\"}[24h])) or on() vector(0))", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "description": "Total failed runs in the last 24 hours." + }, + { + "id": 3, + "type": "table", + "title": "Platform Test Activity (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "sum by (suite, status) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[30d]))", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ], + "description": "Suite/status event counts over 30 days." + }, + { + "id": 4, + "type": "bargauge", + "title": "Platform Test Failures by Suite (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "targets": [ + { + "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status!~\"ok|passed|success\"}[24h])))", + "refId": "A", + "legendFormat": "{{suite}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 5, + "type": "bargauge", + "title": "Platform Test Success Rate by Suite (24h, lowest first)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "targets": [ + { + "expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))", + "refId": "A", + "legendFormat": "{{suite}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "yellow", + "value": 95 + }, + { + "color": "green", + "value": 99 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "asc" + } + } + ] + }, + { + "id": 6, + "type": "timeseries", + "title": "Platform Test Success Rate by Suite", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "targets": [ + { + "refId": "A", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\"}[1h]))) > 0)", + "legendFormat": "ariadne" + }, + { + "refId": "B", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"metis\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"metis\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"metis\"}[1h]))) > 0)", + "legendFormat": "metis" + }, + { + "refId": "C", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"ananke\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"ananke\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"ananke\"}[1h]))) > 0)", + "legendFormat": "ananke" + }, + { + "refId": "D", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[1h]))) > 0)", + "legendFormat": "atlasbot" + }, + { + "refId": "E", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[1h]))) > 0)", + "legendFormat": "lesavka" + }, + { + "refId": "F", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[1h]))) > 0)", + "legendFormat": "pegasus" + }, + { + "refId": "G", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"soteria\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[1h]))) > 0)", + "legendFormat": "soteria" + }, + { + "refId": "H", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[1h]))) > 0)", + "legendFormat": "titan-iac" + }, + { + "refId": "I", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[1h]))) > 0)", + "legendFormat": "bstein-home" + }, + { + "refId": "J", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[1h]))) > 0)", + "legendFormat": "arcanagon" + }, + { + "refId": "K", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\"}[1h]))) > 0)", + "legendFormat": "data-prepper" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "fillOpacity": 10, + "showPoints": "always", + "pointSize": 4, + "spanNulls": true + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "Trend line per suite. Flat gaps mean no runs in that interval." + } + ], + "time": { + "from": "now-30d", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "testing", + "quality" + ] +} diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 451fe8c8..f4ffb16e 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -15,31 +15,91 @@ data: "panels": [ { "id": 1, - "type": "bargauge", - "title": "Ariadne Task Errors (range)", + "type": "stat", + "title": "Schedule Metrics Exported", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 7, - "w": 8, + "h": 4, + "w": 4, "x": 0, "y": 0 }, "targets": [ { - "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))", + "expr": "count(ariadne_schedule_last_success_timestamp_seconds{task=~\"^schedule\\..+$\"}) or on() vector(0)", "refId": "A", - "legendFormat": "{{task}}", "instant": true } ], "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, "unit": "none", - "min": 0, - "max": null, + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Schedule Tasks Stale (>36h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 0 + }, + "targets": [ + { + "expr": "sum(((time() - ariadne_schedule_last_success_timestamp_seconds{task=~\"^schedule\\..+$\"}) > bool 129600)) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ @@ -53,140 +113,61 @@ data: }, { "color": "orange", - "value": 3 + "value": 2 }, { "color": "red", - "value": 5 + "value": 3 } ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" } }, "overrides": [] }, "options": { - "displayMode": "gradient", - "orientation": "horizontal", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] + }, + "textMode": "value" + } }, { - "id": 2, - "type": "timeseries", - "title": "Ariadne Attempts / Failures", + "id": 3, + "type": "stat", + "title": "Schedule Tasks Missing Success", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 7, - "w": 8, + "h": 4, + "w": 4, "x": 8, "y": 0 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", - "refId": "A", - "legendFormat": "Attempts" - }, - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "B", - "legendFormat": "Failures" + "expr": "count((ariadne_schedule_next_run_timestamp_seconds{task=~\"^schedule\\..+$\"} unless on(task) ariadne_schedule_last_success_timestamp_seconds{task=~\"^schedule\\..+$\"})) or on() vector(0)", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "none" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Attempts" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "green" - } - } - ] + "color": { + "mode": "thresholds" }, - { - "matcher": { - "id": "byName", - "options": "Failures" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "red" - } - } - ] - } - ] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 3, - "type": "bargauge", - "title": "One-off Job Pods (age hours)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 16, - "y": 0 - }, - "targets": [ - { - "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", - "refId": "A", - "legendFormat": "{{namespace}}/{{pod}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "min": 0, - "max": null, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ @@ -196,54 +177,110 @@ data: }, { "color": "yellow", - "value": 6 + "value": 1 }, { "color": "orange", - "value": 24 + "value": 2 }, { "color": "red", - "value": 48 + "value": 3 } ] }, - "decimals": 2 + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, "options": { - "displayMode": "gradient", - "orientation": "horizontal", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } }, - { - "id": "limit", - "options": { - "limit": 12 - } - } - ] + "textMode": "value" + } }, { "id": 4, "type": "stat", + "title": "Schedule Tasks Failed Last Run", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "sum(((1 - ariadne_schedule_last_status{task=~\"^schedule\\..+$\"}) > bool 0)) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", "title": "Glue Jobs Stale (>36h)", "datasource": { "type": "prometheus", @@ -252,8 +289,8 @@ data: "gridPos": { "h": 4, "w": 4, - "x": 0, - "y": 7 + "x": 16, + "y": 0 }, "targets": [ { @@ -309,70 +346,10 @@ data: "textMode": "value" } }, - { - "id": 5, - "type": "stat", - "title": "Glue Jobs Missing Success", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 7 - }, - "targets": [ - { - "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, { "id": 6, "type": "stat", - "title": "Glue Jobs Suspended", + "title": "Jenkins Workspace PV Backlog", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -380,12 +357,12 @@ data: "gridPos": { "h": 4, "w": 4, - "x": 8, - "y": 7 + "x": 20, + "y": 0 }, "targets": [ { - "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)", + "expr": "sum((kube_persistentvolume_status_phase{phase=~\"Released|Failed\"} > bool 0) * on(persistentvolume) group_left(claim_namespace,name) kube_persistentvolume_claim_ref{claim_namespace=\"jenkins\",name=~\"pvc-workspace-.*\"}) or on() vector(0)", "refId": "A" } ], @@ -399,12 +376,20 @@ data: "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", + "color": "yellow", "value": 1 + }, + { + "color": "orange", + "value": 10 + }, + { + "color": "red", + "value": 25 } ] }, @@ -431,186 +416,57 @@ data: }, { "id": 7, - "type": "stat", - "title": "Ariadne Task Errors (1h)", + "type": "table", + "title": "Ariadne Schedules: Last Success (h, newest first)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 4, - "w": 4, - "x": 12, - "y": 7 + "h": 8, + "w": 12, + "x": 0, + "y": 4 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", - "refId": "A" + "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600", + "refId": "A", + "instant": true } ], "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", + "unit": "h", "custom": { - "displayMode": "auto" + "filterable": true } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} }, - "textMode": "value" - } + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "asc" + } + } + ], + "description": "Primary schedule inventory ordered by recency so fresh jobs stay at the top." }, { "id": 8, - "type": "stat", - "title": "Ariadne Task Errors (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 16, - "y": 7 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[24h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 9, - "type": "stat", - "title": "Ariadne Task Runs (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 20, - "y": 7 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 10, "type": "bargauge", "title": "Ariadne Schedule Last Error (hours ago)", "datasource": { @@ -618,14 +474,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, + "h": 8, "w": 12, - "x": 0, - "y": 17 + "x": 12, + "y": 4 }, "targets": [ { - "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600)", + "expr": "sort((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -672,81 +528,6 @@ data: "values": false } }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 11, - "type": "bargauge", - "title": "Ariadne Schedule Last Success (hours ago, newest first)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 17 - }, - "targets": [ - { - "expr": "sort((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600)", - "refId": "A", - "legendFormat": "{{task}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 6 - }, - { - "color": "orange", - "value": 24 - }, - { - "color": "red", - "value": 48 - } - ] - }, - "decimals": 2 - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, "transformations": [ { "id": "sortBy", @@ -760,592 +541,7 @@ data: ] }, { - "id": 12, - "type": "bargauge", - "title": "Glue Jobs Last Success (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 0, - "y": 23 - }, - "targets": [ - { - "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", - "refId": "A", - "legendFormat": "{{namespace}}/{{cronjob}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 6 - }, - { - "color": "orange", - "value": 24 - }, - { - "color": "red", - "value": 48 - } - ] - }, - "decimals": 2 - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 13, - "type": "bargauge", - "title": "Glue Jobs Last Schedule (hours ago)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 23 - }, - "targets": [ - { - "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", - "refId": "A", - "legendFormat": "{{namespace}}/{{cronjob}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 6 - }, - { - "color": "orange", - "value": 24 - }, - { - "color": "red", - "value": 48 - } - ] - }, - "decimals": 2 - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 14, - "type": "bargauge", - "title": "Ariadne Task Errors (1h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 0, - "y": 29 - }, - "targets": [ - { - "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))", - "refId": "A", - "legendFormat": "{{task}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 3 - }, - { - "color": "red", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 15, - "type": "bargauge", - "title": "Ariadne Task Errors (30d)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 29 - }, - "targets": [ - { - "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))", - "refId": "A", - "legendFormat": "{{task}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 3 - }, - { - "color": "red", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 16, - "type": "bargauge", - "title": "Ariadne Access Requests", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 11 - }, - "targets": [ - { - "expr": "sort_desc(ariadne_access_requests_total)", - "refId": "A", - "legendFormat": "{{status}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 70 - }, - { - "color": "red", - "value": 85 - } - ] - } - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ] - }, - { - "id": 17, - "type": "stat", - "title": "Platform Test Success Rate (30d)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 4, - "x": 8, - "y": 11 - }, - "targets": [ - { - "expr": "100 * ((sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[30d])) or on() vector(0))), 1)", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "percent", - "custom": { - "displayMode": "auto" - }, - "decimals": 2 - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "description": "Internal rollup across Ariadne task runs and Metis build/flash outcomes." - }, - { - "id": 18, - "type": "table", - "title": "Platform Test Activity (30d)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 11 - }, - "targets": [ - { - "expr": "sum by (suite, status) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[30d]))", - "refId": "A", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "custom": { - "filterable": true - } - }, - "overrides": [] - }, - "options": { - "showHeader": true, - "columnFilters": false - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ], - "description": "Atlas Overview test panels depend on this internal activity table sourced from Ariadne and Metis counters." - }, - { - "id": 19, - "type": "timeseries", - "title": "Platform Test Success Rate by Suite", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 16, - "x": 8, - "y": 17 - }, - "targets": [ - { - "refId": "A", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\"}[1h]))) > 0)", - "legendFormat": "ariadne" - }, - { - "refId": "B", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"metis\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"metis\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"metis\"}[1h]))) > 0)", - "legendFormat": "metis" - }, - { - "refId": "C", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"ananke\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"ananke\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"ananke\"}[1h]))) > 0)", - "legendFormat": "ananke" - }, - { - "refId": "D", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[1h]))) > 0)", - "legendFormat": "atlasbot" - }, - { - "refId": "E", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[1h]))) > 0)", - "legendFormat": "lesavka" - }, - { - "refId": "F", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[1h]))) > 0)", - "legendFormat": "pegasus" - }, - { - "refId": "G", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"soteria\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[1h]))) > 0)", - "legendFormat": "soteria" - }, - { - "refId": "H", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[1h]))) > 0)", - "legendFormat": "titan-iac" - }, - { - "refId": "I", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[1h]))) > 0)", - "legendFormat": "bstein-home" - }, - { - "refId": "J", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[1h]))) > 0)", - "legendFormat": "arcanagon" - }, - { - "refId": "K", - "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\"}[1h]))) > 0)", - "legendFormat": "data-prepper" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "custom": { - "drawStyle": "line", - "lineInterpolation": "linear", - "lineWidth": 2, - "fillOpacity": 10, - "showPoints": "always", - "pointSize": 4, - "spanNulls": true - } - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "description": "Per-run interval pass points (0-100) per suite. Existing suites: ariadne, metis, ananke; additional suites appear automatically when platform_quality_gate_runs_total is published." - }, - { - "id": 20, + "id": 9, "type": "bargauge", "title": "Ariadne Schedule Last Status", "datasource": { @@ -1356,7 +552,7 @@ data: "h": 8, "w": 8, "x": 0, - "y": 35 + "y": 12 }, "targets": [ { @@ -1430,7 +626,7 @@ data: "description": "1 means the last run was ok. 0 means the last run ended in error." }, { - "id": 21, + "id": 10, "type": "bargauge", "title": "Ariadne Schedule Runs (range)", "datasource": { @@ -1441,7 +637,7 @@ data: "h": 8, "w": 8, "x": 8, - "y": 35 + "y": 12 }, "targets": [ { @@ -1493,7 +689,7 @@ data: "description": "Number of runs by schedule task over the selected dashboard time range." }, { - "id": 22, + "id": 11, "type": "bargauge", "title": "Ariadne Schedule Errors (range)", "datasource": { @@ -1504,7 +700,7 @@ data: "h": 8, "w": 8, "x": 16, - "y": 35 + "y": 12 }, "targets": [ { @@ -1566,6 +762,850 @@ data: } ], "description": "Error run count by schedule task over the selected dashboard time range." + }, + { + "id": 12, + "type": "timeseries", + "title": "Ariadne Attempts / Failures", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 20 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", + "refId": "A", + "legendFormat": "Attempts" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "B", + "legendFormat": "Failures" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Attempts" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "green" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 13, + "type": "bargauge", + "title": "One-off Job Pods (age hours)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 20 + }, + "targets": [ + { + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 12 + } + } + ] + }, + { + "id": 14, + "type": "bargauge", + "title": "Glue Jobs Last Success (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 27 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 15, + "type": "bargauge", + "title": "Glue Jobs Last Schedule (hours ago)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 27 + }, + "targets": [ + { + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", + "refId": "A", + "legendFormat": "{{namespace}}/{{cronjob}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 16, + "type": "stat", + "title": "Jenkins Cleanup Signal Present", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 33 + }, + "targets": [ + { + "expr": "count(ariadne_jenkins_workspace_cleanup_last_run_timestamp_seconds) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 17, + "type": "stat", + "title": "Jenkins Cleanup Last Run Age (h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 33 + }, + "targets": [ + { + "expr": "((time() - ariadne_jenkins_workspace_cleanup_last_run_timestamp_seconds) / 3600) or on() vector(999)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "unit": "h", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 18, + "type": "stat", + "title": "Jenkins Cleanup Last Success Age (h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 33 + }, + "targets": [ + { + "expr": "((time() - ariadne_jenkins_workspace_cleanup_last_success_timestamp_seconds) / 3600) or on() vector(999)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 6 + }, + { + "color": "orange", + "value": 24 + }, + { + "color": "red", + "value": 48 + } + ] + }, + "unit": "h", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 19, + "type": "stat", + "title": "Jenkins Cleanup Planned (last run)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 33 + }, + "targets": [ + { + "expr": "ariadne_jenkins_workspace_cleanup_last_planned_total or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 20, + "type": "stat", + "title": "Jenkins Cleanup Deleted (last run)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 33 + }, + "targets": [ + { + "expr": "ariadne_jenkins_workspace_cleanup_last_deleted_total or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 21, + "type": "stat", + "title": "Ariadne Access Requests", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 33 + }, + "targets": [ + { + "expr": "ariadne_access_requests_total", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 22, + "type": "timeseries", + "title": "Jenkins Cleanup Runs (range)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 37 + }, + "targets": [ + { + "refId": "A", + "expr": "sum by (mode, status) (increase(ariadne_jenkins_workspace_cleanup_runs_total[$__range]))", + "legendFormat": "{{mode}}/{{status}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 23, + "type": "timeseries", + "title": "Jenkins Cleanup Objects (range)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 37 + }, + "targets": [ + { + "refId": "A", + "expr": "sum by (kind, action, mode) (increase(ariadne_jenkins_workspace_cleanup_objects_total[$__range]))", + "legendFormat": "{{kind}}/{{action}}/{{mode}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 24, + "type": "bargauge", + "title": "Jenkins Workspace PV Age (h, detached only)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 44 + }, + "targets": [ + { + "expr": "sort_desc(((time() - kube_persistentvolume_created) / 3600) * on(persistentvolume) group_left(claim_namespace,name) kube_persistentvolume_claim_ref{claim_namespace=\"jenkins\",name=~\"pvc-workspace-.*\"} * on(persistentvolume) group_left() (kube_persistentvolume_status_phase{phase=~\"Released|Failed\"} > bool 0))", + "refId": "A", + "legendFormat": "{{name}} -> {{persistentvolume}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "h", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 24 + }, + { + "color": "orange", + "value": 72 + }, + { + "color": "red", + "value": 168 + } + ] + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + }, + { + "id": "limit", + "options": { + "limit": 40 + } + } + ], + "description": "Oldest detached Jenkins workspace volumes first. This is the direct cleanup backlog view." } ], "time": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 347644ca..e5c0ca04 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1924,8 +1924,8 @@ data: }, "links": [ { - "title": "Open atlas-jobs dashboard", - "url": "/d/atlas-jobs", + "title": "Open atlas-testing dashboard", + "url": "/d/atlas-testing", "targetBlank": true } ], diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml new file mode 100644 index 00000000..7aa06032 --- /dev/null +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -0,0 +1,471 @@ +# services/monitoring/grafana-dashboard-testing.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-testing + labels: + grafana_dashboard: "1" +data: + atlas-testing.json: | + { + "uid": "atlas-testing", + "title": "Atlas Testing", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Platform Test Success Rate (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "100 * ((sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[30d])) or on() vector(0))) / clamp_min(((sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[30d])) or on() vector(0))), 1)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "yellow", + "value": 95 + }, + { + "color": "green", + "value": 99 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "description": "Overall success rate across tracked suites over the last 30 days." + }, + { + "id": 2, + "type": "stat", + "title": "Platform Test Failures (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "(sum(increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status!~\"ok|passed|success\"}[24h])) or on() vector(0))", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "description": "Total failed runs in the last 24 hours." + }, + { + "id": 3, + "type": "table", + "title": "Platform Test Activity (30d)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "sum by (suite, status) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[30d]))", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "custom": { + "filterable": true + } + }, + "overrides": [] + }, + "options": { + "showHeader": true, + "columnFilters": false + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ], + "description": "Suite/status event counts over 30 days." + }, + { + "id": 4, + "type": "bargauge", + "title": "Platform Test Failures by Suite (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "targets": [ + { + "expr": "sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status!~\"ok|passed|success\"}[24h])))", + "refId": "A", + "legendFormat": "{{suite}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "none", + "min": 0, + "max": null, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "desc" + } + } + ] + }, + { + "id": 5, + "type": "bargauge", + "title": "Platform Test Success Rate by Suite (24h, lowest first)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "targets": [ + { + "expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))", + "refId": "A", + "legendFormat": "{{suite}}", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "yellow", + "value": 95 + }, + { + "color": "green", + "value": 99 + } + ] + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": [ + "Value" + ], + "order": "asc" + } + } + ] + }, + { + "id": 6, + "type": "timeseries", + "title": "Platform Test Success Rate by Suite", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "targets": [ + { + "refId": "A", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"ariadne\"}[1h]))) > 0)", + "legendFormat": "ariadne" + }, + { + "refId": "B", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"metis\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"metis\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"metis\"}[1h]))) > 0)", + "legendFormat": "metis" + }, + { + "refId": "C", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"ananke\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"ananke\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"ananke\"}[1h]))) > 0)", + "legendFormat": "ananke" + }, + { + "refId": "D", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"atlasbot\"}[1h]))) > 0)", + "legendFormat": "atlasbot" + }, + { + "refId": "E", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"lesavka\"}[1h]))) > 0)", + "legendFormat": "lesavka" + }, + { + "refId": "F", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"pegasus\"}[1h]))) > 0)", + "legendFormat": "pegasus" + }, + { + "refId": "G", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"soteria\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"soteria\"}[1h]))) > 0)", + "legendFormat": "soteria" + }, + { + "refId": "H", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"titan-iac\"}[1h]))) > 0)", + "legendFormat": "titan-iac" + }, + { + "refId": "I", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"bstein-home\"}[1h]))) > 0)", + "legendFormat": "bstein-home" + }, + { + "refId": "J", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"arcanagon\"}[1h]))) > 0)", + "legendFormat": "arcanagon" + }, + { + "refId": "K", + "expr": "(100 * (sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\",status=~\"ok|passed|success\"}[1h]))) / clamp_min((sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\"}[1h]))), 1)) and on() ((sum(increase(platform_quality_gate_runs_total{suite=\"data-prepper\"}[1h]))) > 0)", + "legendFormat": "data-prepper" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "fillOpacity": 10, + "showPoints": "always", + "pointSize": 4, + "spanNulls": true + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "Trend line per suite. Flat gaps mean no runs in that interval." + } + ], + "time": { + "from": "now-30d", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "testing", + "quality" + ] + } diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 67580f60..f6c7ab9d 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -16,6 +16,7 @@ resources: - grafana-dashboard-power.yaml - grafana-dashboard-mail.yaml - grafana-dashboard-jobs.yaml + - grafana-dashboard-testing.yaml - dcgm-exporter.yaml - jetson-tegrastats-exporter.yaml - postmark-exporter-service.yaml