From da200235bb349021aa7d10cafeaccbe7a435caab Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sun, 18 Jan 2026 12:26:04 -0300 Subject: [PATCH] monitoring: fix glue dashboard queries --- scripts/dashboards_render_atlas.py | 14 ++++++++------ services/monitoring/dashboards/atlas-testing.json | 12 ++++++------ services/monitoring/grafana-dashboard-testing.yaml | 12 ++++++------ 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 5e3e7f0..0931b48 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -319,16 +319,18 @@ NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"' NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"] GLUE_LABEL = 'label_atlas_bstein_dev_glue="true"' GLUE_JOBS = f"kube_cronjob_labels{{{GLUE_LABEL}}}" -GLUE_LAST_SUCCESS = f"kube_cronjob_status_last_successful_time{{{GLUE_LABEL}}}" -GLUE_LAST_SCHEDULE = f"kube_cronjob_status_last_schedule_time{{{GLUE_LABEL}}}" -GLUE_SUSPENDED = f"kube_cronjob_spec_suspend{{{GLUE_LABEL}}} == 1" +GLUE_FILTER = f"and on(namespace,cronjob) {GLUE_JOBS}" +GLUE_LAST_SUCCESS = f"(kube_cronjob_status_last_successful_time {GLUE_FILTER})" +GLUE_LAST_SCHEDULE = f"(kube_cronjob_status_last_schedule_time {GLUE_FILTER})" +GLUE_SUSPENDED = f"(kube_cronjob_spec_suspend {GLUE_FILTER}) == 1" +GLUE_ACTIVE = f"(kube_cronjob_status_active {GLUE_FILTER})" GLUE_LAST_SUCCESS_AGE = f"(time() - {GLUE_LAST_SUCCESS})" GLUE_LAST_SCHEDULE_AGE = f"(time() - {GLUE_LAST_SCHEDULE})" GLUE_LAST_SUCCESS_AGE_HOURS = f"({GLUE_LAST_SUCCESS_AGE}) / 3600" GLUE_LAST_SCHEDULE_AGE_HOURS = f"({GLUE_LAST_SCHEDULE_AGE}) / 3600" GLUE_STALE_WINDOW_SEC = 36 * 3600 GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})" -GLUE_MISSING = f"({GLUE_JOBS} unless {GLUE_LAST_SUCCESS})" +GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)" GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})" GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})" GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))" @@ -2188,7 +2190,7 @@ def build_testing_dashboard(): table_panel( 3, "Glue Jobs Suspended", - f"kube_cronjob_spec_suspend{{{GLUE_LABEL}}} == 1", + GLUE_SUSPENDED, {"h": 4, "w": 6, "x": 12, "y": 0}, unit="none", transformations=sort_desc, @@ -2199,7 +2201,7 @@ def build_testing_dashboard(): table_panel( 4, "Glue Jobs Active Runs", - f"kube_cronjob_status_active{{{GLUE_LABEL}}}", + GLUE_ACTIVE, {"h": 4, "w": 6, "x": 18, "y": 0}, unit="none", transformations=sort_desc, diff --git a/services/monitoring/dashboards/atlas-testing.json b/services/monitoring/dashboards/atlas-testing.json index 831d960..25cf3f8 100644 --- a/services/monitoring/dashboards/atlas-testing.json +++ b/services/monitoring/dashboards/atlas-testing.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))", + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", "refId": "A" } ], @@ -88,7 +88,7 @@ }, "targets": [ { - "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)", + "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", "refId": "A", "instant": true } @@ -138,7 +138,7 @@ }, "targets": [ { - "expr": "kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1", + "expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1", "refId": "A", "instant": true } @@ -188,7 +188,7 @@ }, "targets": [ { - "expr": "kube_cronjob_status_active{label_atlas_bstein_dev_glue=\"true\"}", + "expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})", "refId": "A", "instant": true } @@ -238,7 +238,7 @@ }, "targets": [ { - "expr": "((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600", + "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", "refId": "A", "instant": true } @@ -288,7 +288,7 @@ }, "targets": [ { - "expr": "((time() - kube_cronjob_status_last_schedule_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600", + "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", "refId": "A", "instant": true } diff --git a/services/monitoring/grafana-dashboard-testing.yaml b/services/monitoring/grafana-dashboard-testing.yaml index 2215a2d..80a7043 100644 --- a/services/monitoring/grafana-dashboard-testing.yaml +++ b/services/monitoring/grafana-dashboard-testing.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))", + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", "refId": "A" } ], @@ -97,7 +97,7 @@ data: }, "targets": [ { - "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)", + "expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", "refId": "A", "instant": true } @@ -147,7 +147,7 @@ data: }, "targets": [ { - "expr": "kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1", + "expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1", "refId": "A", "instant": true } @@ -197,7 +197,7 @@ data: }, "targets": [ { - "expr": "kube_cronjob_status_active{label_atlas_bstein_dev_glue=\"true\"}", + "expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})", "refId": "A", "instant": true } @@ -247,7 +247,7 @@ data: }, "targets": [ { - "expr": "((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600", + "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", "refId": "A", "instant": true } @@ -297,7 +297,7 @@ data: }, "targets": [ { - "expr": "((time() - kube_cronjob_status_last_schedule_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600", + "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", "refId": "A", "instant": true }