diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index c3f3655..1f28489 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -748,6 +748,12 @@ def bargauge_panel( overrides=None, ): """Return a bar gauge panel with label-aware reduction.""" + cleaned_expr = expr.strip() + if not cleaned_expr.startswith(("sort(", "sort_desc(")): + if sort_order == "desc": + expr = f"sort_desc({expr})" + elif sort_order == "asc": + expr = f"sort({expr})" panel = { "id": panel_id, "type": "bargauge", @@ -1165,21 +1171,20 @@ def build_overview(): { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": PROM_DS, "gridPos": {"h": 6, "w": 6, "x": 6, "y": 14}, "targets": [ {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, - {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"}, - {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"}, ], "fieldConfig": { "defaults": {"unit": "none"}, "overrides": [ { - "matcher": {"id": "byName", "options": "Warnings"}, + "matcher": {"id": "byName", "options": "Attempts"}, "properties": [ - {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}} + {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}} ], }, { @@ -2361,21 +2366,20 @@ def build_jobs_dashboard(): { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": PROM_DS, "gridPos": {"h": 7, "w": 8, "x": 8, "y": 0}, "targets": [ {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, - {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"}, - {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"}, ], "fieldConfig": { "defaults": {"unit": "none"}, "overrides": [ { - "matcher": {"id": "byName", "options": "Warnings"}, + "matcher": {"id": "byName", "options": "Attempts"}, "properties": [ - {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}} + {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}} ], }, { diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index 810b3b3..37b888d 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -81,7 +81,7 @@ { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -98,14 +98,9 @@ "refId": "A", "legendFormat": "Attempts" }, - { - "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", - "refId": "B", - "legendFormat": "Warnings" - }, { "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "C", + "refId": "B", "legendFormat": "Failures" } ], @@ -117,14 +112,14 @@ { "matcher": { "id": "byName", - "options": "Warnings" + "options": "Attempts" }, "properties": [ { "id": "color", "value": { "mode": "fixed", - "fixedColor": "yellow" + "fixedColor": "green" } } ] @@ -172,7 +167,7 @@ }, "targets": [ { - "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", "refId": "A", "legendFormat": "{{namespace}}/{{pod}}", "instant": true @@ -621,7 +616,7 @@ }, "targets": [ { - "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -696,7 +691,7 @@ }, "targets": [ { - "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -771,7 +766,7 @@ }, "targets": [ { - "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -846,7 +841,7 @@ }, "targets": [ { - "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -921,7 +916,7 @@ }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -995,7 +990,7 @@ }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -1069,7 +1064,7 @@ }, "targets": [ { - "expr": "ariadne_access_requests_total", + "expr": "sort_desc(ariadne_access_requests_total)", "refId": "A", "legendFormat": "{{status}}", "instant": true diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 3feb531..78744da 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1410,7 +1410,7 @@ }, "targets": [ { - "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", "refId": "A", "legendFormat": "{{namespace}}/{{pod}}", "instant": true @@ -1478,7 +1478,7 @@ { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1495,14 +1495,9 @@ "refId": "A", "legendFormat": "Attempts" }, - { - "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", - "refId": "B", - "legendFormat": "Warnings" - }, { "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "C", + "refId": "B", "legendFormat": "Failures" } ], @@ -1514,14 +1509,14 @@ { "matcher": { "id": "byName", - "options": "Warnings" + "options": "Attempts" }, "properties": [ { "id": "color", "value": { "mode": "fixed", - "fixedColor": "yellow" + "fixedColor": "green" } } ] @@ -1606,7 +1601,7 @@ }, "targets": [ { - "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))", + "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", "refId": "A", "legendFormat": "{{result}}", "instant": true @@ -2137,7 +2132,7 @@ }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -2398,7 +2393,7 @@ }, "targets": [ { - "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index e36aa1f..0c8104c 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -439,7 +439,7 @@ }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 279d959..b16c9cb 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -90,7 +90,7 @@ data: { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -107,14 +107,9 @@ data: "refId": "A", "legendFormat": "Attempts" }, - { - "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", - "refId": "B", - "legendFormat": "Warnings" - }, { "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "C", + "refId": "B", "legendFormat": "Failures" } ], @@ -126,14 +121,14 @@ data: { "matcher": { "id": "byName", - "options": "Warnings" + "options": "Attempts" }, "properties": [ { "id": "color", "value": { "mode": "fixed", - "fixedColor": "yellow" + "fixedColor": "green" } } ] @@ -181,7 +176,7 @@ data: }, "targets": [ { - "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", "refId": "A", "legendFormat": "{{namespace}}/{{pod}}", "instant": true @@ -630,7 +625,7 @@ data: }, "targets": [ { - "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -705,7 +700,7 @@ data: }, "targets": [ { - "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -780,7 +775,7 @@ data: }, "targets": [ { - "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -855,7 +850,7 @@ data: }, "targets": [ { - "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", + "expr": "sort_desc((time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600)", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -930,7 +925,7 @@ data: }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[1h])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -1004,7 +999,7 @@ data: }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d]))", + "expr": "sort_desc(sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[30d])))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -1078,7 +1073,7 @@ data: }, "targets": [ { - "expr": "ariadne_access_requests_total", + "expr": "sort_desc(ariadne_access_requests_total)", "refId": "A", "legendFormat": "{{status}}", "instant": true diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 66b6da0..fa19911 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1419,7 +1419,7 @@ data: }, "targets": [ { - "expr": "((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})", + "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", "refId": "A", "legendFormat": "{{namespace}}/{{pod}}", "instant": true @@ -1487,7 +1487,7 @@ data: { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts / Warnings / Failures", + "title": "Ariadne Attempts / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1504,14 +1504,9 @@ data: "refId": "A", "legendFormat": "Attempts" }, - { - "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", - "refId": "B", - "legendFormat": "Warnings" - }, { "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "C", + "refId": "B", "legendFormat": "Failures" } ], @@ -1523,14 +1518,14 @@ data: { "matcher": { "id": "byName", - "options": "Warnings" + "options": "Attempts" }, "properties": [ { "id": "color", "value": { "mode": "fixed", - "fixedColor": "yellow" + "fixedColor": "green" } } ] @@ -1615,7 +1610,7 @@ data: }, "targets": [ { - "expr": "sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h]))", + "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", "refId": "A", "legendFormat": "{{result}}", "instant": true @@ -2146,7 +2141,7 @@ data: }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -2407,7 +2402,7 @@ data: }, "targets": [ { - "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index 6273023..1461eac 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -448,7 +448,7 @@ data: }, "targets": [ { - "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", + "expr": "sort_desc(topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node)))", "refId": "A", "legendFormat": "{{node}}", "instant": true