monitoring: fix glue dashboard queries by joining cronjob metrics on kube_cronjob_labels
This commit is contained in:
parent
ae3b0afbff
commit
da200235bb
@ -319,16 +319,18 @@ NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
|
||||
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
|
||||
GLUE_LABEL = 'label_atlas_bstein_dev_glue="true"'
|
||||
GLUE_JOBS = f"kube_cronjob_labels{{{GLUE_LABEL}}}"
|
||||
GLUE_LAST_SUCCESS = f"kube_cronjob_status_last_successful_time{{{GLUE_LABEL}}}"
|
||||
GLUE_LAST_SCHEDULE = f"kube_cronjob_status_last_schedule_time{{{GLUE_LABEL}}}"
|
||||
GLUE_SUSPENDED = f"kube_cronjob_spec_suspend{{{GLUE_LABEL}}} == 1"
|
||||
GLUE_FILTER = f"and on(namespace,cronjob) {GLUE_JOBS}"
|
||||
GLUE_LAST_SUCCESS = f"(kube_cronjob_status_last_successful_time {GLUE_FILTER})"
|
||||
GLUE_LAST_SCHEDULE = f"(kube_cronjob_status_last_schedule_time {GLUE_FILTER})"
|
||||
GLUE_SUSPENDED = f"(kube_cronjob_spec_suspend {GLUE_FILTER}) == 1"
|
||||
GLUE_ACTIVE = f"(kube_cronjob_status_active {GLUE_FILTER})"
|
||||
GLUE_LAST_SUCCESS_AGE = f"(time() - {GLUE_LAST_SUCCESS})"
|
||||
GLUE_LAST_SCHEDULE_AGE = f"(time() - {GLUE_LAST_SCHEDULE})"
|
||||
GLUE_LAST_SUCCESS_AGE_HOURS = f"({GLUE_LAST_SUCCESS_AGE}) / 3600"
|
||||
GLUE_LAST_SCHEDULE_AGE_HOURS = f"({GLUE_LAST_SCHEDULE_AGE}) / 3600"
|
||||
GLUE_STALE_WINDOW_SEC = 36 * 3600
|
||||
GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})"
|
||||
GLUE_MISSING = f"({GLUE_JOBS} unless {GLUE_LAST_SUCCESS})"
|
||||
GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)"
|
||||
GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
|
||||
GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
|
||||
GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))"
|
||||
@ -2188,7 +2190,7 @@ def build_testing_dashboard():
|
||||
table_panel(
|
||||
3,
|
||||
"Glue Jobs Suspended",
|
||||
f"kube_cronjob_spec_suspend{{{GLUE_LABEL}}} == 1",
|
||||
GLUE_SUSPENDED,
|
||||
{"h": 4, "w": 6, "x": 12, "y": 0},
|
||||
unit="none",
|
||||
transformations=sort_desc,
|
||||
@ -2199,7 +2201,7 @@ def build_testing_dashboard():
|
||||
table_panel(
|
||||
4,
|
||||
"Glue Jobs Active Runs",
|
||||
f"kube_cronjob_status_active{{{GLUE_LABEL}}}",
|
||||
GLUE_ACTIVE,
|
||||
{"h": 4, "w": 6, "x": 18, "y": 0},
|
||||
unit="none",
|
||||
transformations=sort_desc,
|
||||
|
||||
@ -20,7 +20,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))",
|
||||
"expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -88,7 +88,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)",
|
||||
"expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -138,7 +138,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1",
|
||||
"expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -188,7 +188,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "kube_cronjob_status_active{label_atlas_bstein_dev_glue=\"true\"}",
|
||||
"expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -238,7 +238,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600",
|
||||
"expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -288,7 +288,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((time() - kube_cronjob_status_last_schedule_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600",
|
||||
"expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
|
||||
@ -29,7 +29,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))",
|
||||
"expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -97,7 +97,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)",
|
||||
"expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -147,7 +147,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1",
|
||||
"expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -197,7 +197,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "kube_cronjob_status_active{label_atlas_bstein_dev_glue=\"true\"}",
|
||||
"expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -247,7 +247,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600",
|
||||
"expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
@ -297,7 +297,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((time() - kube_cronjob_status_last_schedule_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600",
|
||||
"expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user