monitoring: enforce sorted job lists

This commit is contained in:
Brad Stein 2026-01-21 15:12:53 -03:00
parent db4c3b7c51
commit 4721d44a33
7 changed files with 58 additions and 74 deletions

View File

@ -748,6 +748,12 @@ def bargauge_panel(
overrides=None,
):
"""Return a bar gauge panel with label-aware reduction."""
cleaned_expr = expr.strip()
if not cleaned_expr.startswith(("sort(", "sort_desc(")):
if sort_order == "desc":
expr = f"sort_desc({expr})"
elif sort_order == "asc":
expr = f"sort({expr})"
panel = {
"id": panel_id,
"type": "bargauge",
@ -1165,21 +1171,20 @@ def build_overview():
{
"id": 41,
"type": "timeseries",
"title": "Ariadne Attempts / Warnings / Failures",
"title": "Ariadne Attempts / Failures",
"datasource": PROM_DS,
"gridPos": {"h": 6, "w": 6, "x": 6, "y": 14},
"targets": [
{"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
{"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"},
{"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"},
{"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
],
"fieldConfig": {
"defaults": {"unit": "none"},
"overrides": [
{
"matcher": {"id": "byName", "options": "Warnings"},
"matcher": {"id": "byName", "options": "Attempts"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}
{"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
],
},
{
@ -2361,21 +2366,20 @@ def build_jobs_dashboard():
{
"id": 2,
"type": "timeseries",
"title": "Ariadne Attempts / Warnings / Failures",
"title": "Ariadne Attempts / Failures",
"datasource": PROM_DS,
"gridPos": {"h": 7, "w": 8, "x": 8, "y": 0},
"targets": [
{"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
{"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"},
{"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"},
{"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
],
"fieldConfig": {
"defaults": {"unit": "none"},
"overrides": [
{
"matcher": {"id": "byName", "options": "Warnings"},
"matcher": {"id": "byName", "options": "Attempts"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}
{"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
],
},
{

View File

@ -20,7 +20,7 @@
},
"targets": [
{
"expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))",
"expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))",
"refId": "A",
"legendFormat": "{{task}}",
"instant": true
@ -81,7 +81,7 @@
{
"id": 2,
"type": "timeseries",
"title": "Ariadne Attempts / Warnings / Failures",
"title": "Ariadne Attempts / Failures",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -98,14 +98,9 @@
"refId": "A",
"legendFormat": "Attempts"
},
{
"expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)",
"refId": "B",
"legendFormat": "Warnings"
},
{
"expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))",
"refId": "C",
"refId": "B",
"legendFormat": "Failures"
}
],
@ -117,14 +112,14 @@
{
"matcher": {
"id": "byName",
"options": "Warnings"
"options": "Attempts"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "yellow"
"fixedColor": "green"
}
}
]
@ -172,7 +167,7 @@
},
"targets": [
{
"expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})",
"expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pod}}",
"instant": true
@ -621,7 +616,7 @@
},
"targets": [
{
"expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600",
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)",
"refId": "A",
"legendFormat": "{{task}}",
"instant": true
@ -696,7 +691,7 @@
},
"targets": [
{
"expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600",
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)",
"refId": "A",
"legendFormat": "{{task}}",
"instant": true
@ -771,7 +766,7 @@
},
"targets": [
{
"expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600",
"expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)",
"refId": "A",
"legendFormat": "{{namespace}}/{{cronjob}}",
"instant": true
@ -846,7 +841,7 @@
},
"targets": [
{
"expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600",
"expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)",
"refId": "A",
"legendFormat": "{{namespace}}/{{cronjob}}",
"instant": true
@ -921,7 +916,7 @@
},
"targets": [
{
"expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
"expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))",
"refId": "A",
"legendFormat": "{{task}}",
"instant": true
@ -995,7 +990,7 @@
},
"targets": [
{
"expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))",
"expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))",
"refId": "A",
"legendFormat": "{{task}}",
"instant": true
@ -1069,7 +1064,7 @@
},
"targets": [
{
"expr": "ariadne_access_requests_total",
"expr": "sort_desc(ariadne_access_requests_total)",
"refId": "A",
"legendFormat": "{{status}}",
"instant": true

View File

@ -1410,7 +1410,7 @@
},
"targets": [
{
"expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})",
"expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pod}}",
"instant": true
@ -1478,7 +1478,7 @@
{
"id": 41,
"type": "timeseries",
"title": "Ariadne Attempts / Warnings / Failures",
"title": "Ariadne Attempts / Failures",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1495,14 +1495,9 @@
"refId": "A",
"legendFormat": "Attempts"
},
{
"expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)",
"refId": "B",
"legendFormat": "Warnings"
},
{
"expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))",
"refId": "C",
"refId": "B",
"legendFormat": "Failures"
}
],
@ -1514,14 +1509,14 @@
{
"matcher": {
"id": "byName",
"options": "Warnings"
"options": "Attempts"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "yellow"
"fixedColor": "green"
}
}
]
@ -1606,7 +1601,7 @@
},
"targets": [
{
"expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))",
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
"refId": "A",
"legendFormat": "{{result}}",
"instant": true
@ -2137,7 +2132,7 @@
},
"targets": [
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
@ -2398,7 +2393,7 @@
},
"targets": [
{
"expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"refId": "A",
"legendFormat": "{{node}}"
}

View File

@ -439,7 +439,7 @@
},
"targets": [
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true

View File

@ -29,7 +29,7 @@ data:
},
"targets": [
{
"expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))",
"expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))",
"refId": "A",
"legendFormat": "{{task}}",
"instant": true
@ -90,7 +90,7 @@ data:
{
"id": 2,
"type": "timeseries",
"title": "Ariadne Attempts / Warnings / Failures",
"title": "Ariadne Attempts / Failures",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -107,14 +107,9 @@ data:
"refId": "A",
"legendFormat": "Attempts"
},
{
"expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)",
"refId": "B",
"legendFormat": "Warnings"
},
{
"expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))",
"refId": "C",
"refId": "B",
"legendFormat": "Failures"
}
],
@ -126,14 +121,14 @@ data:
{
"matcher": {
"id": "byName",
"options": "Warnings"
"options": "Attempts"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "yellow"
"fixedColor": "green"
}
}
]
@ -181,7 +176,7 @@ data:
},
"targets": [
{
"expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})",
"expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pod}}",
"instant": true
@ -630,7 +625,7 @@ data:
},
"targets": [
{
"expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600",
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)",
"refId": "A",
"legendFormat": "{{task}}",
"instant": true
@ -705,7 +700,7 @@ data:
},
"targets": [
{
"expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600",
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)",
"refId": "A",
"legendFormat": "{{task}}",
"instant": true
@ -780,7 +775,7 @@ data:
},
"targets": [
{
"expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600",
"expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)",
"refId": "A",
"legendFormat": "{{namespace}}/{{cronjob}}",
"instant": true
@ -855,7 +850,7 @@ data:
},
"targets": [
{
"expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600",
"expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)",
"refId": "A",
"legendFormat": "{{namespace}}/{{cronjob}}",
"instant": true
@ -930,7 +925,7 @@ data:
},
"targets": [
{
"expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))",
"expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))",
"refId": "A",
"legendFormat": "{{task}}",
"instant": true
@ -1004,7 +999,7 @@ data:
},
"targets": [
{
"expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))",
"expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))",
"refId": "A",
"legendFormat": "{{task}}",
"instant": true
@ -1078,7 +1073,7 @@ data:
},
"targets": [
{
"expr": "ariadne_access_requests_total",
"expr": "sort_desc(ariadne_access_requests_total)",
"refId": "A",
"legendFormat": "{{status}}",
"instant": true

View File

@ -1419,7 +1419,7 @@ data:
},
"targets": [
{
"expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})",
"expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pod}}",
"instant": true
@ -1487,7 +1487,7 @@ data:
{
"id": 41,
"type": "timeseries",
"title": "Ariadne Attempts / Warnings / Failures",
"title": "Ariadne Attempts / Failures",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1504,14 +1504,9 @@ data:
"refId": "A",
"legendFormat": "Attempts"
},
{
"expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)",
"refId": "B",
"legendFormat": "Warnings"
},
{
"expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))",
"refId": "C",
"refId": "B",
"legendFormat": "Failures"
}
],
@ -1523,14 +1518,14 @@ data:
{
"matcher": {
"id": "byName",
"options": "Warnings"
"options": "Attempts"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "yellow"
"fixedColor": "green"
}
}
]
@ -1615,7 +1610,7 @@ data:
},
"targets": [
{
"expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))",
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
"refId": "A",
"legendFormat": "{{result}}",
"instant": true
@ -2146,7 +2141,7 @@ data:
},
"targets": [
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true
@ -2407,7 +2402,7 @@ data:
},
"targets": [
{
"expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"refId": "A",
"legendFormat": "{{node}}"
}

View File

@ -448,7 +448,7 @@ data:
},
"targets": [
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))",
"refId": "A",
"legendFormat": "{{node}}",
"instant": true