monitoring: fix glue dashboard queries

This commit is contained in:
Brad Stein 2026-01-18 12:26:04 -03:00
parent ae3b0afbff
commit da200235bb
3 changed files with 20 additions and 18 deletions

View File

@ -319,16 +319,18 @@ NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
GLUE_LABEL = 'label_atlas_bstein_dev_glue="true"'
GLUE_JOBS = f"kube_cronjob_labels{{{GLUE_LABEL}}}"
GLUE_LAST_SUCCESS = f"kube_cronjob_status_last_successful_time{{{GLUE_LABEL}}}"
GLUE_LAST_SCHEDULE = f"kube_cronjob_status_last_schedule_time{{{GLUE_LABEL}}}"
GLUE_SUSPENDED = f"kube_cronjob_spec_suspend{{{GLUE_LABEL}}} == 1"
GLUE_FILTER = f"and on(namespace,cronjob) {GLUE_JOBS}"
GLUE_LAST_SUCCESS = f"(kube_cronjob_status_last_successful_time {GLUE_FILTER})"
GLUE_LAST_SCHEDULE = f"(kube_cronjob_status_last_schedule_time {GLUE_FILTER})"
GLUE_SUSPENDED = f"(kube_cronjob_spec_suspend {GLUE_FILTER}) == 1"
GLUE_ACTIVE = f"(kube_cronjob_status_active {GLUE_FILTER})"
GLUE_LAST_SUCCESS_AGE = f"(time() - {GLUE_LAST_SUCCESS})"
GLUE_LAST_SCHEDULE_AGE = f"(time() - {GLUE_LAST_SCHEDULE})"
GLUE_LAST_SUCCESS_AGE_HOURS = f"({GLUE_LAST_SUCCESS_AGE}) / 3600"
GLUE_LAST_SCHEDULE_AGE_HOURS = f"({GLUE_LAST_SCHEDULE_AGE}) / 3600"
GLUE_STALE_WINDOW_SEC = 36 * 3600
GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})"
GLUE_MISSING = f"({GLUE_JOBS} unless {GLUE_LAST_SUCCESS})"
GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)"
GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))"
@ -2188,7 +2190,7 @@ def build_testing_dashboard():
table_panel(
3,
"Glue Jobs Suspended",
f"kube_cronjob_spec_suspend{{{GLUE_LABEL}}} == 1",
GLUE_SUSPENDED,
{"h": 4, "w": 6, "x": 12, "y": 0},
unit="none",
transformations=sort_desc,
@ -2199,7 +2201,7 @@ def build_testing_dashboard():
table_panel(
4,
"Glue Jobs Active Runs",
f"kube_cronjob_status_active{{{GLUE_LABEL}}}",
GLUE_ACTIVE,
{"h": 4, "w": 6, "x": 18, "y": 0},
unit="none",
transformations=sort_desc,

View File

@ -20,7 +20,7 @@
},
"targets": [
{
"expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))",
"expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))",
"refId": "A"
}
],
@ -88,7 +88,7 @@
},
"targets": [
{
"expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)",
"expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
"refId": "A",
"instant": true
}
@ -138,7 +138,7 @@
},
"targets": [
{
"expr": "kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1",
"expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1",
"refId": "A",
"instant": true
}
@ -188,7 +188,7 @@
},
"targets": [
{
"expr": "kube_cronjob_status_active{label_atlas_bstein_dev_glue=\"true\"}",
"expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})",
"refId": "A",
"instant": true
}
@ -238,7 +238,7 @@
},
"targets": [
{
"expr": "((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600",
"expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
"refId": "A",
"instant": true
}
@ -288,7 +288,7 @@
},
"targets": [
{
"expr": "((time() - kube_cronjob_status_last_schedule_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600",
"expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
"refId": "A",
"instant": true
}

View File

@ -29,7 +29,7 @@ data:
},
"targets": [
{
"expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))",
"expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))",
"refId": "A"
}
],
@ -97,7 +97,7 @@ data:
},
"targets": [
{
"expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)",
"expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
"refId": "A",
"instant": true
}
@ -147,7 +147,7 @@ data:
},
"targets": [
{
"expr": "kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1",
"expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1",
"refId": "A",
"instant": true
}
@ -197,7 +197,7 @@ data:
},
"targets": [
{
"expr": "kube_cronjob_status_active{label_atlas_bstein_dev_glue=\"true\"}",
"expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})",
"refId": "A",
"instant": true
}
@ -247,7 +247,7 @@ data:
},
"targets": [
{
"expr": "((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600",
"expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
"refId": "A",
"instant": true
}
@ -297,7 +297,7 @@ data:
},
"targets": [
{
"expr": "((time() - kube_cronjob_status_last_schedule_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600",
"expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
"refId": "A",
"instant": true
}