monitoring: fix glue dashboard queries
This commit is contained in:
parent
ae3b0afbff
commit
da200235bb
@ -319,16 +319,18 @@ NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
|
|||||||
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
|
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
|
||||||
GLUE_LABEL = 'label_atlas_bstein_dev_glue="true"'
|
GLUE_LABEL = 'label_atlas_bstein_dev_glue="true"'
|
||||||
GLUE_JOBS = f"kube_cronjob_labels{{{GLUE_LABEL}}}"
|
GLUE_JOBS = f"kube_cronjob_labels{{{GLUE_LABEL}}}"
|
||||||
GLUE_LAST_SUCCESS = f"kube_cronjob_status_last_successful_time{{{GLUE_LABEL}}}"
|
GLUE_FILTER = f"and on(namespace,cronjob) {GLUE_JOBS}"
|
||||||
GLUE_LAST_SCHEDULE = f"kube_cronjob_status_last_schedule_time{{{GLUE_LABEL}}}"
|
GLUE_LAST_SUCCESS = f"(kube_cronjob_status_last_successful_time {GLUE_FILTER})"
|
||||||
GLUE_SUSPENDED = f"kube_cronjob_spec_suspend{{{GLUE_LABEL}}} == 1"
|
GLUE_LAST_SCHEDULE = f"(kube_cronjob_status_last_schedule_time {GLUE_FILTER})"
|
||||||
|
GLUE_SUSPENDED = f"(kube_cronjob_spec_suspend {GLUE_FILTER}) == 1"
|
||||||
|
GLUE_ACTIVE = f"(kube_cronjob_status_active {GLUE_FILTER})"
|
||||||
GLUE_LAST_SUCCESS_AGE = f"(time() - {GLUE_LAST_SUCCESS})"
|
GLUE_LAST_SUCCESS_AGE = f"(time() - {GLUE_LAST_SUCCESS})"
|
||||||
GLUE_LAST_SCHEDULE_AGE = f"(time() - {GLUE_LAST_SCHEDULE})"
|
GLUE_LAST_SCHEDULE_AGE = f"(time() - {GLUE_LAST_SCHEDULE})"
|
||||||
GLUE_LAST_SUCCESS_AGE_HOURS = f"({GLUE_LAST_SUCCESS_AGE}) / 3600"
|
GLUE_LAST_SUCCESS_AGE_HOURS = f"({GLUE_LAST_SUCCESS_AGE}) / 3600"
|
||||||
GLUE_LAST_SCHEDULE_AGE_HOURS = f"({GLUE_LAST_SCHEDULE_AGE}) / 3600"
|
GLUE_LAST_SCHEDULE_AGE_HOURS = f"({GLUE_LAST_SCHEDULE_AGE}) / 3600"
|
||||||
GLUE_STALE_WINDOW_SEC = 36 * 3600
|
GLUE_STALE_WINDOW_SEC = 36 * 3600
|
||||||
GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})"
|
GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})"
|
||||||
GLUE_MISSING = f"({GLUE_JOBS} unless {GLUE_LAST_SUCCESS})"
|
GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)"
|
||||||
GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
|
GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
|
||||||
GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
|
GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})"
|
||||||
GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))"
|
GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))"
|
||||||
@ -2188,7 +2190,7 @@ def build_testing_dashboard():
|
|||||||
table_panel(
|
table_panel(
|
||||||
3,
|
3,
|
||||||
"Glue Jobs Suspended",
|
"Glue Jobs Suspended",
|
||||||
f"kube_cronjob_spec_suspend{{{GLUE_LABEL}}} == 1",
|
GLUE_SUSPENDED,
|
||||||
{"h": 4, "w": 6, "x": 12, "y": 0},
|
{"h": 4, "w": 6, "x": 12, "y": 0},
|
||||||
unit="none",
|
unit="none",
|
||||||
transformations=sort_desc,
|
transformations=sort_desc,
|
||||||
@ -2199,7 +2201,7 @@ def build_testing_dashboard():
|
|||||||
table_panel(
|
table_panel(
|
||||||
4,
|
4,
|
||||||
"Glue Jobs Active Runs",
|
"Glue Jobs Active Runs",
|
||||||
f"kube_cronjob_status_active{{{GLUE_LABEL}}}",
|
GLUE_ACTIVE,
|
||||||
{"h": 4, "w": 6, "x": 18, "y": 0},
|
{"h": 4, "w": 6, "x": 18, "y": 0},
|
||||||
unit="none",
|
unit="none",
|
||||||
transformations=sort_desc,
|
transformations=sort_desc,
|
||||||
|
|||||||
@ -20,7 +20,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))",
|
"expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -88,7 +88,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)",
|
"expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -138,7 +138,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1",
|
"expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -188,7 +188,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "kube_cronjob_status_active{label_atlas_bstein_dev_glue=\"true\"}",
|
"expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -238,7 +238,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600",
|
"expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -288,7 +288,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "((time() - kube_cronjob_status_last_schedule_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600",
|
"expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
|
|||||||
@ -29,7 +29,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "(sum((((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) > bool 129600) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)))",
|
"expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -97,7 +97,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"}) unless on(namespace,cronjob) kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1)",
|
"expr": "((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -147,7 +147,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "kube_cronjob_spec_suspend{label_atlas_bstein_dev_glue=\"true\"} == 1",
|
"expr": "(kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -197,7 +197,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "kube_cronjob_status_active{label_atlas_bstein_dev_glue=\"true\"}",
|
"expr": "(kube_cronjob_status_active and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -247,7 +247,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "((time() - kube_cronjob_status_last_successful_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600",
|
"expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -297,7 +297,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "((time() - kube_cronjob_status_last_schedule_time{label_atlas_bstein_dev_glue=\"true\"})) / 3600",
|
"expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user