From e222344cd9aae151894863d3344d9c2d4cd8df27 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sun, 12 Apr 2026 20:09:43 -0300 Subject: [PATCH] monitoring(jobs): add schedule fallback series for cold starts --- scripts/dashboards_render_atlas.py | 27 +++++++++++++++---- .../monitoring/dashboards/atlas-jobs.json | 13 +++++---- .../monitoring/grafana-dashboard-jobs.yaml | 13 +++++---- 3 files changed, 38 insertions(+), 15 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 1b4c8e44..c920a4a7 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -431,6 +431,7 @@ ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = ( ARIADNE_SCHEDULE_NEXT_RUN_HOURS = ( f"((ariadne_schedule_next_run_timestamp_seconds{{{ARIADNE_SCHEDULE_TASK_FILTER}}} - time()) / 3600)" ) +ARIADNE_SCHEDULE_TASK_INDEX = f"ariadne_schedule_next_run_timestamp_seconds{{{ARIADNE_SCHEDULE_TASK_FILTER}}}" ARIADNE_SCHEDULE_LAST_STATUS = f"ariadne_schedule_last_status{{{ARIADNE_SCHEDULE_TASK_FILTER}}}" ARIADNE_SCHEDULE_SIGNAL_COUNT = ( f"count(ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_SCHEDULE_TASK_FILTER}}}) or on() vector(0)" @@ -453,6 +454,21 @@ ARIADNE_SCHEDULE_RUNS_RANGE = ( ARIADNE_SCHEDULE_ERRORS_RANGE = ( f'sum by (task) (increase(ariadne_task_runs_total{{status="error",{ARIADNE_SCHEDULE_TASK_FILTER}}}[$__range]))' ) +ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS_FALLBACK = ( + f"({ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS}) or on(task) (0 * {ARIADNE_SCHEDULE_TASK_INDEX} + 999)" +) +ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS_FALLBACK = ( + f"({ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS}) or on(task) (0 * {ARIADNE_SCHEDULE_TASK_INDEX} + 999)" +) +ARIADNE_SCHEDULE_LAST_STATUS_FALLBACK = ( + f"({ARIADNE_SCHEDULE_LAST_STATUS}) or on(task) (0 * {ARIADNE_SCHEDULE_TASK_INDEX} - 1)" +) +ARIADNE_SCHEDULE_RUNS_RANGE_FALLBACK = ( + f"({ARIADNE_SCHEDULE_RUNS_RANGE}) or on(task) (0 * {ARIADNE_SCHEDULE_TASK_INDEX})" +) +ARIADNE_SCHEDULE_ERRORS_RANGE_FALLBACK = ( + f"({ARIADNE_SCHEDULE_ERRORS_RANGE}) or on(task) (0 * {ARIADNE_SCHEDULE_TASK_INDEX})" +) JENKINS_CLEANUP_SIGNAL_COUNT = ( "count(ariadne_jenkins_workspace_cleanup_last_run_timestamp_seconds) or on() vector(0)" ) @@ -3072,7 +3088,7 @@ def build_jobs_dashboard(): schedule_list_panel = table_panel( 7, "Ariadne Schedules: Last Success (h, newest first)", - ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS, + ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS_FALLBACK, {"h": 8, "w": 12, "x": 0, "y": 4}, unit="h", transformations=[ @@ -3087,7 +3103,7 @@ def build_jobs_dashboard(): bargauge_panel( 8, "Ariadne Schedule Last Error (hours ago)", - ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS, + ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS_FALLBACK, {"h": 8, "w": 12, "x": 12, "y": 4}, unit="h", instant=True, @@ -3100,7 +3116,7 @@ def build_jobs_dashboard(): status_panel = bargauge_panel( 9, "Ariadne Schedule Last Status", - ARIADNE_SCHEDULE_LAST_STATUS, + ARIADNE_SCHEDULE_LAST_STATUS_FALLBACK, {"h": 8, "w": 8, "x": 0, "y": 12}, unit="none", instant=True, @@ -3114,6 +3130,7 @@ def build_jobs_dashboard(): { "type": "value", "options": { + "-1": {"text": "pending"}, "0": {"text": "error"}, "1": {"text": "ok"}, }, @@ -3123,7 +3140,7 @@ def build_jobs_dashboard(): schedule_runs_panel = bargauge_panel( 10, "Ariadne Schedule Runs (range)", - ARIADNE_SCHEDULE_RUNS_RANGE, + ARIADNE_SCHEDULE_RUNS_RANGE_FALLBACK, {"h": 8, "w": 8, "x": 8, "y": 12}, unit="none", instant=True, @@ -3135,7 +3152,7 @@ def build_jobs_dashboard(): schedule_errors_panel = bargauge_panel( 11, "Ariadne Schedule Errors (range)", - ARIADNE_SCHEDULE_ERRORS_RANGE, + ARIADNE_SCHEDULE_ERRORS_RANGE_FALLBACK, {"h": 8, "w": 8, "x": 16, "y": 12}, unit="none", instant=True, diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index 79ed97e4..d58d7f52 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -421,7 +421,7 @@ }, "targets": [ { - "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600", + "expr": "((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600) or on(task) (0 * ariadne_schedule_next_run_timestamp_seconds{task=~\"^schedule\\..+$\"} + 999)", "refId": "A", "instant": true } @@ -472,7 +472,7 @@ }, "targets": [ { - "expr": "sort((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600)", + "expr": "sort(((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600) or on(task) (0 * ariadne_schedule_next_run_timestamp_seconds{task=~\"^schedule\\..+$\"} + 999))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -547,7 +547,7 @@ }, "targets": [ { - "expr": "sort(ariadne_schedule_last_status{task=~\"^schedule\\..+$\"})", + "expr": "sort((ariadne_schedule_last_status{task=~\"^schedule\\..+$\"}) or on(task) (0 * ariadne_schedule_next_run_timestamp_seconds{task=~\"^schedule\\..+$\"} - 1))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -580,6 +580,9 @@ { "type": "value", "options": { + "-1": { + "text": "pending" + }, "0": { "text": "error" }, @@ -632,7 +635,7 @@ }, "targets": [ { - "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{task=~\"^schedule\\..+$\"}[$__range])))", + "expr": "sort_desc((sum by (task) (increase(ariadne_task_runs_total{task=~\"^schedule\\..+$\"}[$__range]))) or on(task) (0 * ariadne_schedule_next_run_timestamp_seconds{task=~\"^schedule\\..+$\"}))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -695,7 +698,7 @@ }, "targets": [ { - "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\",task=~\"^schedule\\..+$\"}[$__range])))", + "expr": "sort_desc((sum by (task) (increase(ariadne_task_runs_total{status=\"error\",task=~\"^schedule\\..+$\"}[$__range]))) or on(task) (0 * ariadne_schedule_next_run_timestamp_seconds{task=~\"^schedule\\..+$\"}))", "refId": "A", "legendFormat": "{{task}}", "instant": true diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index f4ffb16e..982ef51b 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -430,7 +430,7 @@ data: }, "targets": [ { - "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600", + "expr": "((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600) or on(task) (0 * ariadne_schedule_next_run_timestamp_seconds{task=~\"^schedule\\..+$\"} + 999)", "refId": "A", "instant": true } @@ -481,7 +481,7 @@ data: }, "targets": [ { - "expr": "sort((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600)", + "expr": "sort(((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^schedule\\..+$\"}[$__range])) / 3600) or on(task) (0 * ariadne_schedule_next_run_timestamp_seconds{task=~\"^schedule\\..+$\"} + 999))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -556,7 +556,7 @@ data: }, "targets": [ { - "expr": "sort(ariadne_schedule_last_status{task=~\"^schedule\\..+$\"})", + "expr": "sort((ariadne_schedule_last_status{task=~\"^schedule\\..+$\"}) or on(task) (0 * ariadne_schedule_next_run_timestamp_seconds{task=~\"^schedule\\..+$\"} - 1))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -589,6 +589,9 @@ data: { "type": "value", "options": { + "-1": { + "text": "pending" + }, "0": { "text": "error" }, @@ -641,7 +644,7 @@ data: }, "targets": [ { - "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{task=~\"^schedule\\..+$\"}[$__range])))", + "expr": "sort_desc((sum by (task) (increase(ariadne_task_runs_total{task=~\"^schedule\\..+$\"}[$__range]))) or on(task) (0 * ariadne_schedule_next_run_timestamp_seconds{task=~\"^schedule\\..+$\"}))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -704,7 +707,7 @@ data: }, "targets": [ { - "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\",task=~\"^schedule\\..+$\"}[$__range])))", + "expr": "sort_desc((sum by (task) (increase(ariadne_task_runs_total{status=\"error\",task=~\"^schedule\\..+$\"}[$__range]))) or on(task) (0 * ariadne_schedule_next_run_timestamp_seconds{task=~\"^schedule\\..+$\"}))", "refId": "A", "legendFormat": "{{task}}", "instant": true