From b0996e9a4fd67e0d3c93ac144738434d6d922ae1 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Wed, 21 Jan 2026 14:30:55 -0300 Subject: [PATCH] monitoring: refine jobs/overview panels --- scripts/dashboards_render_atlas.py | 162 ++++++++++++------ .../monitoring/dashboards/atlas-jobs.json | 119 ++++++++----- .../monitoring/dashboards/atlas-nodes.json | 4 +- .../monitoring/dashboards/atlas-overview.json | 135 +++++++++------ .../monitoring/dashboards/atlas-pods.json | 2 +- .../monitoring/grafana-dashboard-jobs.yaml | 119 ++++++++----- .../monitoring/grafana-dashboard-nodes.yaml | 4 +- .../grafana-dashboard-overview.yaml | 135 +++++++++------ .../monitoring/grafana-dashboard-pods.yaml | 2 +- 9 files changed, 446 insertions(+), 236 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 1235a0a..3d581c7 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -70,6 +70,7 @@ WORKER_NODES = [ "titan-13", "titan-14", "titan-15", + "titan-16", "titan-17", "titan-18", "titan-19", @@ -333,9 +334,10 @@ GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})" GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)" GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})" GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})" -GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))" -GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})" -GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" +GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE})) or on() vector(0)" +GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE}) or on() vector(0)" +GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED}) or on() vector(0)" +ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))' ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))' ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))' @@ -344,10 +346,19 @@ ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_to ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))' -ARIADNE_TASK_ATTEMPTS_1H = 'sum(increase(ariadne_task_runs_total[1h]))' -ARIADNE_TASK_FAILURES_1H = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))' +ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[$__interval]))' +ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[$__interval]))' +ARIADNE_TASK_WARNINGS_SERIES = ( + 'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)' +) ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" ARIADNE_SCHEDULE_LAST_ERROR_HOURS = "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600" +ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = ( + "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600" +) +ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = ( + "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600" +) ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' @@ -370,6 +381,8 @@ ONEOFF_JOB_POD_AGE_HOURS = ( '* on(namespace,pod) group_left(phase) ' 'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})' ) +GLUE_LAST_SUCCESS_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SUCCESS}[$__range])) / 3600" +GLUE_LAST_SCHEDULE_RANGE_HOURS = f"(time() - max_over_time({GLUE_LAST_SCHEDULE}[$__range])) / 3600" GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" @@ -1032,7 +1045,7 @@ def build_overview(): 30, "Mail Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', - {"h": 3, "w": 5, "x": 0, "y": 8}, + {"h": 3, "w": 6, "x": 0, "y": 8}, unit="none", links=link_to("atlas-mail"), ) @@ -1043,7 +1056,7 @@ def build_overview(): "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, - "gridPos": {"h": 3, "w": 5, "x": 10, "y": 8}, + "gridPos": {"h": 3, "w": 6, "x": 12, "y": 8}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', @@ -1089,7 +1102,7 @@ def build_overview(): 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', - {"h": 3, "w": 5, "x": 5, "y": 8}, + {"h": 3, "w": 6, "x": 6, "y": 8}, unit="percent", thresholds=mail_success_thresholds, decimals=1, @@ -1101,7 +1114,7 @@ def build_overview(): 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", - {"h": 3, "w": 5, "x": 15, "y": 8}, + {"h": 3, "w": 6, "x": 18, "y": 8}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, @@ -1121,7 +1134,7 @@ def build_overview(): panel_id, title, expr, - {"h": 5, "w": 6, "x": 6 * idx, "y": 11}, + {"h": 3, "w": 6, "x": 6 * idx, "y": 11}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, links=link_to("atlas-storage"), @@ -1133,26 +1146,44 @@ def build_overview(): 40, "One-off Job Pods (age hours)", ONEOFF_JOB_POD_AGE_HOURS, - {"h": 6, "w": 4, "x": 0, "y": 16}, + {"h": 6, "w": 6, "x": 0, "y": 14}, unit="h", instant=True, legend="{{namespace}}/{{pod}}", thresholds=age_thresholds, limit=8, + decimals=2, ) ) panels.append( { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": PROM_DS, - "gridPos": {"h": 6, "w": 8, "x": 4, "y": 16}, + "gridPos": {"h": 6, "w": 6, "x": 6, "y": 14}, "targets": [ - {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"}, - {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"}, + {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"}, ], - "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []}, + "fieldConfig": { + "defaults": {"unit": "none"}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Warnings"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}} + ], + }, + { + "matcher": {"id": "byName", "options": "Failures"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}} + ], + }, + ], + }, "options": { "legend": {"displayMode": "table", "placement": "right"}, "tooltip": {"mode": "multi"}, @@ -1164,7 +1195,7 @@ def build_overview(): 42, "Ariadne Test Success Rate", ARIADNE_TEST_SUCCESS_RATE, - {"h": 6, "w": 8, "x": 12, "y": 16}, + {"h": 6, "w": 6, "x": 12, "y": 14}, unit="percent", legend=None, legend_display="list", @@ -1175,7 +1206,7 @@ def build_overview(): 43, "Tests with Failures (24h)", ARIADNE_TEST_FAILURES_24H, - {"h": 6, "w": 4, "x": 20, "y": 16}, + {"h": 6, "w": 6, "x": 18, "y": 14}, unit="none", instant=True, legend="{{result}}", @@ -1200,7 +1231,7 @@ def build_overview(): 11, "Namespace CPU Share", namespace_cpu_share_expr(cpu_scope), - {"h": 9, "w": 8, "x": 0, "y": 22}, + {"h": 9, "w": 8, "x": 0, "y": 20}, links=namespace_scope_links("namespace_scope_cpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1210,7 +1241,7 @@ def build_overview(): 12, "Namespace GPU Share", namespace_gpu_share_expr(gpu_scope), - {"h": 9, "w": 8, "x": 8, "y": 22}, + {"h": 9, "w": 8, "x": 8, "y": 20}, links=namespace_scope_links("namespace_scope_gpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1220,7 +1251,7 @@ def build_overview(): 13, "Namespace RAM Share", namespace_ram_share_expr(ram_scope), - {"h": 9, "w": 8, "x": 16, "y": 22}, + {"h": 9, "w": 8, "x": 16, "y": 20}, links=namespace_scope_links("namespace_scope_ram"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) @@ -1232,7 +1263,7 @@ def build_overview(): 14, "Worker Node CPU", node_cpu_expr(worker_filter), - {"h": 12, "w": 12, "x": 0, "y": 38}, + {"h": 12, "w": 12, "x": 0, "y": 36}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1246,7 +1277,7 @@ def build_overview(): 15, "Worker Node RAM", node_mem_expr(worker_filter), - {"h": 12, "w": 12, "x": 12, "y": 38}, + {"h": 12, "w": 12, "x": 12, "y": 36}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1261,7 +1292,7 @@ def build_overview(): 16, "Control plane CPU", node_cpu_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 0, "y": 50}, + {"h": 10, "w": 12, "x": 0, "y": 48}, unit="percent", legend="{{node}}", legend_display="table", @@ -1273,7 +1304,7 @@ def build_overview(): 17, "Control plane RAM", node_mem_expr(CONTROL_ALL_REGEX), - {"h": 10, "w": 12, "x": 12, "y": 50}, + {"h": 10, "w": 12, "x": 12, "y": 48}, unit="percent", legend="{{node}}", legend_display="table", @@ -1286,7 +1317,7 @@ def build_overview(): 28, "Node Pod Share", '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100', - {"h": 10, "w": 12, "x": 0, "y": 60}, + {"h": 10, "w": 12, "x": 0, "y": 58}, ) ) panels.append( @@ -1294,7 +1325,7 @@ def build_overview(): 29, "Top Nodes by Pod Count", 'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))', - {"h": 10, "w": 12, "x": 12, "y": 60}, + {"h": 10, "w": 12, "x": 12, "y": 58}, unit="none", limit=12, decimals=0, @@ -1316,7 +1347,7 @@ def build_overview(): 18, "Cluster Ingress Throughput", NET_INGRESS_EXPR, - {"h": 7, "w": 8, "x": 0, "y": 31}, + {"h": 7, "w": 8, "x": 0, "y": 29}, unit="Bps", legend="Ingress (Traefik)", legend_display="list", @@ -1329,7 +1360,7 @@ def build_overview(): 19, "Cluster Egress Throughput", NET_EGRESS_EXPR, - {"h": 7, "w": 8, "x": 8, "y": 31}, + {"h": 7, "w": 8, "x": 8, "y": 29}, unit="Bps", legend="Egress (Traefik)", legend_display="list", @@ -1342,7 +1373,7 @@ def build_overview(): 20, "Intra-Cluster Throughput", NET_INTERNAL_EXPR, - {"h": 7, "w": 8, "x": 16, "y": 31}, + {"h": 7, "w": 8, "x": 16, "y": 29}, unit="Bps", legend="Internal traffic", legend_display="list", @@ -1356,7 +1387,7 @@ def build_overview(): 21, "Root Filesystem Usage", root_usage_expr(), - {"h": 16, "w": 12, "x": 0, "y": 70}, + {"h": 16, "w": 12, "x": 0, "y": 68}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -1371,7 +1402,7 @@ def build_overview(): 22, "Nodes Closest to Full Root Disks", f"topk(12, {root_usage_expr()})", - {"h": 16, "w": 12, "x": 12, "y": 70}, + {"h": 16, "w": 12, "x": 12, "y": 68}, unit="percent", thresholds=PERCENT_THRESHOLDS, links=link_to("atlas-storage"), @@ -2300,9 +2331,9 @@ def build_jobs_dashboard(): panels.append( bargauge_panel( 1, - "Ariadne Task Errors (24h)", - ARIADNE_TASK_ERRORS_24H, - {"h": 7, "w": 6, "x": 0, "y": 0}, + "Ariadne Task Errors (range)", + ARIADNE_TASK_ERRORS_RANGE, + {"h": 7, "w": 8, "x": 0, "y": 0}, unit="none", instant=True, legend="{{task}}", @@ -2313,14 +2344,31 @@ def build_jobs_dashboard(): { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": PROM_DS, - "gridPos": {"h": 7, "w": 12, "x": 6, "y": 0}, + "gridPos": {"h": 7, "w": 8, "x": 8, "y": 0}, "targets": [ - {"expr": ARIADNE_TASK_ATTEMPTS_1H, "refId": "A", "legendFormat": "Attempts"}, - {"expr": ARIADNE_TASK_FAILURES_1H, "refId": "B", "legendFormat": "Failures"}, + {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, + {"expr": ARIADNE_TASK_WARNINGS_SERIES, "refId": "B", "legendFormat": "Warnings"}, + {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "C", "legendFormat": "Failures"}, ], - "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []}, + "fieldConfig": { + "defaults": {"unit": "none"}, + "overrides": [ + { + "matcher": {"id": "byName", "options": "Warnings"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}} + ], + }, + { + "matcher": {"id": "byName", "options": "Failures"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}} + ], + }, + ], + }, "options": { "legend": {"displayMode": "table", "placement": "right"}, "tooltip": {"mode": "multi"}, @@ -2332,12 +2380,13 @@ def build_jobs_dashboard(): 3, "One-off Job Pods (age hours)", ONEOFF_JOB_POD_AGE_HOURS, - {"h": 7, "w": 6, "x": 18, "y": 0}, + {"h": 7, "w": 8, "x": 16, "y": 0}, unit="h", instant=True, legend="{{namespace}}/{{pod}}", thresholds=age_thresholds, limit=12, + decimals=2, ) ) panels.append( @@ -2407,48 +2456,53 @@ def build_jobs_dashboard(): bargauge_panel( 10, "Ariadne Schedule Last Error (hours ago)", - ARIADNE_SCHEDULE_LAST_ERROR_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 11}, + ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS, + {"h": 6, "w": 12, "x": 0, "y": 17}, unit="h", instant=True, legend="{{task}}", thresholds=recent_error_thresholds, + sort_order="asc", + decimals=2, ) ) panels.append( bargauge_panel( 11, "Ariadne Schedule Last Success (hours ago)", - ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 11}, + ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS, + {"h": 6, "w": 12, "x": 12, "y": 17}, unit="h", instant=True, legend="{{task}}", thresholds=age_thresholds, + decimals=2, ) ) panels.append( bargauge_panel( 12, "Glue Jobs Last Success (hours ago)", - GLUE_LAST_SUCCESS_AGE_HOURS, - {"h": 8, "w": 12, "x": 0, "y": 19}, + GLUE_LAST_SUCCESS_RANGE_HOURS, + {"h": 6, "w": 12, "x": 0, "y": 23}, unit="h", instant=True, legend="{{namespace}}/{{cronjob}}", thresholds=age_thresholds, + decimals=2, ) ) panels.append( bargauge_panel( 13, "Glue Jobs Last Schedule (hours ago)", - GLUE_LAST_SCHEDULE_AGE_HOURS, - {"h": 8, "w": 12, "x": 12, "y": 19}, + GLUE_LAST_SCHEDULE_RANGE_HOURS, + {"h": 6, "w": 12, "x": 12, "y": 23}, unit="h", instant=True, legend="{{namespace}}/{{cronjob}}", thresholds=age_thresholds, + decimals=2, ) ) panels.append( @@ -2456,7 +2510,7 @@ def build_jobs_dashboard(): 14, "Ariadne Task Errors (1h)", ARIADNE_TASK_ERRORS_1H, - {"h": 8, "w": 12, "x": 0, "y": 27}, + {"h": 6, "w": 12, "x": 0, "y": 29}, unit="none", instant=True, legend="{{task}}", @@ -2468,7 +2522,7 @@ def build_jobs_dashboard(): 15, "Ariadne Task Errors (30d)", ARIADNE_TASK_ERRORS_30D, - {"h": 8, "w": 12, "x": 12, "y": 27}, + {"h": 6, "w": 12, "x": 12, "y": 29}, unit="none", instant=True, legend="{{task}}", @@ -2480,7 +2534,7 @@ def build_jobs_dashboard(): 16, "Ariadne Access Requests", ARIADNE_ACCESS_REQUESTS, - {"h": 6, "w": 8, "x": 0, "y": 35}, + {"h": 6, "w": 8, "x": 0, "y": 11}, unit="none", instant=True, legend="{{status}}", @@ -2491,7 +2545,7 @@ def build_jobs_dashboard(): 17, "Ariadne CI Coverage (%)", ARIADNE_CI_COVERAGE, - {"h": 6, "w": 4, "x": 8, "y": 35}, + {"h": 6, "w": 4, "x": 8, "y": 11}, unit="percent", decimals=1, instant=True, @@ -2503,7 +2557,7 @@ def build_jobs_dashboard(): 18, "Ariadne CI Tests (latest)", ARIADNE_CI_TESTS, - {"h": 6, "w": 12, "x": 12, "y": 35}, + {"h": 6, "w": 12, "x": 12, "y": 11}, unit="none", transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], instant=True, diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index 76e21f0..c70e9c0 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -7,20 +7,20 @@ { "id": 1, "type": "bargauge", - "title": "Ariadne Task Errors (24h)", + "title": "Ariadne Task Errors (range)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, - "w": 6, + "w": 8, "x": 0, "y": 0 }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -81,26 +81,31 @@ { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, - "w": 12, - "x": 6, + "w": 8, + "x": 8, "y": 0 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", "refId": "A", "legendFormat": "Attempts" }, { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", "refId": "B", + "legendFormat": "Warnings" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "C", "legendFormat": "Failures" } ], @@ -108,7 +113,38 @@ "defaults": { "unit": "none" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Warnings" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { "legend": { @@ -130,8 +166,8 @@ }, "gridPos": { "h": 7, - "w": 6, - "x": 18, + "w": 8, + "x": 16, "y": 0 }, "targets": [ @@ -167,7 +203,8 @@ "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -216,7 +253,7 @@ }, "targets": [ { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)", "refId": "A" } ], @@ -284,7 +321,7 @@ }, "targets": [ { - "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)", "refId": "A" } ], @@ -344,7 +381,7 @@ }, "targets": [ { - "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)", "refId": "A" } ], @@ -577,14 +614,14 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 11 + "y": 17 }, "targets": [ { - "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", + "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -615,7 +652,8 @@ "value": 24 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -637,7 +675,7 @@ "fields": [ "Value" ], - "order": "desc" + "order": "asc" } } ] @@ -651,14 +689,14 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 11 + "y": 17 }, "targets": [ { - "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", + "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -689,7 +727,8 @@ "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -725,14 +764,14 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 19 + "y": 23 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -763,7 +802,8 @@ "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -799,14 +839,14 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 19 + "y": 23 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -837,7 +877,8 @@ "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -873,10 +914,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 27 + "y": 29 }, "targets": [ { @@ -947,10 +988,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 27 + "y": 29 }, "targets": [ { @@ -1024,7 +1065,7 @@ "h": 6, "w": 8, "x": 0, - "y": 35 + "y": 11 }, "targets": [ { @@ -1098,7 +1139,7 @@ "h": 6, "w": 4, "x": 8, - "y": 35 + "y": 11 }, "targets": [ { @@ -1161,7 +1202,7 @@ "h": 6, "w": 12, "x": 12, - "y": 35 + "y": 11 }, "targets": [ { diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index 2d60042..ea59579 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -46,7 +46,7 @@ "unit": "none", "custom": { "displayMode": "auto", - "valueSuffix": "/19" + "valueSuffix": "/20" } }, "overrides": [] diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index c3ff327..5acc2a3 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -449,14 +449,14 @@ }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, - "max": 19, + "max": 20, "thresholds": { "mode": "absolute", "steps": [ @@ -466,15 +466,15 @@ }, { "color": "orange", - "value": 17 - }, - { - "color": "yellow", "value": 18 }, { - "color": "green", + "color": "yellow", "value": 19 + }, + { + "color": "green", + "value": 20 } ] } @@ -796,7 +796,7 @@ }, "gridPos": { "h": 3, - "w": 5, + "w": 6, "x": 0, "y": 8 }, @@ -863,8 +863,8 @@ }, "gridPos": { "h": 3, - "w": 5, - "x": 10, + "w": 6, + "x": 12, "y": 8 }, "targets": [ @@ -968,8 +968,8 @@ }, "gridPos": { "h": 3, - "w": 5, - "x": 5, + "w": 6, + "x": 6, "y": 8 }, "targets": [ @@ -1044,8 +1044,8 @@ }, "gridPos": { "h": 3, - "w": 5, - "x": 15, + "w": 6, + "x": 18, "y": 8 }, "targets": [ @@ -1119,7 +1119,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 0, "y": 11 @@ -1194,7 +1194,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 6, "y": 11 @@ -1269,7 +1269,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 12, "y": 11 @@ -1336,7 +1336,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 18, "y": 11 @@ -1404,9 +1404,9 @@ }, "gridPos": { "h": 6, - "w": 4, + "w": 6, "x": 0, - "y": 16 + "y": 14 }, "targets": [ { @@ -1441,7 +1441,8 @@ "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -1477,26 +1478,31 @@ { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, - "w": 8, - "x": 4, - "y": 16 + "w": 6, + "x": 6, + "y": 14 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", "refId": "A", "legendFormat": "Attempts" }, { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", "refId": "B", + "legendFormat": "Warnings" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "C", "legendFormat": "Failures" } ], @@ -1504,7 +1510,38 @@ "defaults": { "unit": "none" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Warnings" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { "legend": { @@ -1526,9 +1563,9 @@ }, "gridPos": { "h": 6, - "w": 8, + "w": 6, "x": 12, - "y": 16 + "y": 14 }, "targets": [ { @@ -1562,9 +1599,9 @@ }, "gridPos": { "h": 6, - "w": 4, - "x": 20, - "y": 16 + "w": 6, + "x": 18, + "y": 14 }, "targets": [ { @@ -1638,7 +1675,7 @@ "h": 9, "w": 8, "x": 0, - "y": 22 + "y": 20 }, "targets": [ { @@ -1707,7 +1744,7 @@ "h": 9, "w": 8, "x": 8, - "y": 22 + "y": 20 }, "targets": [ { @@ -1776,7 +1813,7 @@ "h": 9, "w": 8, "x": 16, - "y": 22 + "y": 20 }, "targets": [ { @@ -1845,11 +1882,11 @@ "h": 12, "w": 12, "x": 0, - "y": 38 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1892,11 +1929,11 @@ "h": 12, "w": 12, "x": 12, - "y": 38 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1939,7 +1976,7 @@ "h": 10, "w": 12, "x": 0, - "y": 50 + "y": 48 }, "targets": [ { @@ -1976,7 +2013,7 @@ "h": 10, "w": 12, "x": 12, - "y": 50 + "y": 48 }, "targets": [ { @@ -2013,7 +2050,7 @@ "h": 10, "w": 12, "x": 0, - "y": 60 + "y": 58 }, "targets": [ { @@ -2064,7 +2101,7 @@ "h": 10, "w": 12, "x": 12, - "y": 60 + "y": 58 }, "targets": [ { @@ -2145,7 +2182,7 @@ "h": 7, "w": 8, "x": 0, - "y": 31 + "y": 29 }, "targets": [ { @@ -2189,7 +2226,7 @@ "h": 7, "w": 8, "x": 8, - "y": 31 + "y": 29 }, "targets": [ { @@ -2233,7 +2270,7 @@ "h": 7, "w": 8, "x": 16, - "y": 31 + "y": 29 }, "targets": [ { @@ -2277,7 +2314,7 @@ "h": 16, "w": 12, "x": 0, - "y": 70 + "y": 68 }, "targets": [ { @@ -2325,7 +2362,7 @@ "h": 16, "w": 12, "x": 12, - "y": 70 + "y": 68 }, "targets": [ { diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index adab84b..e36aa1f 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -520,7 +520,7 @@ }, "targets": [ { - "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)))))", + "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))", "refId": "A", "instant": true, "format": "table" diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 19e0d4e..36c1252 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -16,20 +16,20 @@ data: { "id": 1, "type": "bargauge", - "title": "Ariadne Task Errors (24h)", + "title": "Ariadne Task Errors (range)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, - "w": 6, + "w": 8, "x": 0, "y": 0 }, "targets": [ { - "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[24h]))", + "expr": "sum by (task) (increase(ariadne_task_runs_total{status=\"error\"}[$__range]))", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -90,26 +90,31 @@ data: { "id": 2, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 7, - "w": 12, - "x": 6, + "w": 8, + "x": 8, "y": 0 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", "refId": "A", "legendFormat": "Attempts" }, { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", "refId": "B", + "legendFormat": "Warnings" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "C", "legendFormat": "Failures" } ], @@ -117,7 +122,38 @@ data: "defaults": { "unit": "none" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Warnings" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { "legend": { @@ -139,8 +175,8 @@ data: }, "gridPos": { "h": 7, - "w": 6, - "x": 18, + "w": 8, + "x": 16, "y": 0 }, "targets": [ @@ -176,7 +212,8 @@ data: "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -225,7 +262,7 @@ data: }, "targets": [ { - "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)))", + "expr": "(sum((((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})) > bool 129600) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) + count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))) or on() vector(0)", "refId": "A" } ], @@ -293,7 +330,7 @@ data: }, "targets": [ { - "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1))", + "expr": "count(((kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) unless on(namespace,cronjob) (kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)) or on() vector(0)", "refId": "A" } ], @@ -353,7 +390,7 @@ data: }, "targets": [ { - "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1)", + "expr": "sum((kube_cronjob_spec_suspend and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}) == 1) or on() vector(0)", "refId": "A" } ], @@ -586,14 +623,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 11 + "y": 17 }, "targets": [ { - "expr": "(time() - ariadne_schedule_last_error_timestamp_seconds) / 3600", + "expr": "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -624,7 +661,8 @@ data: "value": 24 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -646,7 +684,7 @@ data: "fields": [ "Value" ], - "order": "desc" + "order": "asc" } } ] @@ -660,14 +698,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 11 + "y": 17 }, "targets": [ { - "expr": "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600", + "expr": "(time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds[$__range])) / 3600", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -698,7 +736,8 @@ data: "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -734,14 +773,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 19 + "y": 23 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "expr": "(time() - max_over_time((kube_cronjob_status_last_successful_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -772,7 +811,8 @@ data: "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -808,14 +848,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 19 + "y": 23 }, "targets": [ { - "expr": "((time() - (kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"}))) / 3600", + "expr": "(time() - max_over_time((kube_cronjob_status_last_schedule_time and on(namespace,cronjob) kube_cronjob_labels{label_atlas_bstein_dev_glue=\"true\"})[$__range])) / 3600", "refId": "A", "legendFormat": "{{namespace}}/{{cronjob}}", "instant": true @@ -846,7 +886,8 @@ data: "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -882,10 +923,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 0, - "y": 27 + "y": 29 }, "targets": [ { @@ -956,10 +997,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 6, "w": 12, "x": 12, - "y": 27 + "y": 29 }, "targets": [ { @@ -1033,7 +1074,7 @@ data: "h": 6, "w": 8, "x": 0, - "y": 35 + "y": 11 }, "targets": [ { @@ -1107,7 +1148,7 @@ data: "h": 6, "w": 4, "x": 8, - "y": 35 + "y": 11 }, "targets": [ { @@ -1170,7 +1211,7 @@ data: "h": 6, "w": 12, "x": 12, - "y": 35 + "y": 11 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index f0f1982..98123b9 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -55,7 +55,7 @@ data: "unit": "none", "custom": { "displayMode": "auto", - "valueSuffix": "/19" + "valueSuffix": "/20" } }, "overrides": [] diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 45969cc..55196e8 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -458,14 +458,14 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { "min": 0, - "max": 19, + "max": 20, "thresholds": { "mode": "absolute", "steps": [ @@ -475,15 +475,15 @@ data: }, { "color": "orange", - "value": 17 - }, - { - "color": "yellow", "value": 18 }, { - "color": "green", + "color": "yellow", "value": 19 + }, + { + "color": "green", + "value": 20 } ] } @@ -805,7 +805,7 @@ data: }, "gridPos": { "h": 3, - "w": 5, + "w": 6, "x": 0, "y": 8 }, @@ -872,8 +872,8 @@ data: }, "gridPos": { "h": 3, - "w": 5, - "x": 10, + "w": 6, + "x": 12, "y": 8 }, "targets": [ @@ -977,8 +977,8 @@ data: }, "gridPos": { "h": 3, - "w": 5, - "x": 5, + "w": 6, + "x": 6, "y": 8 }, "targets": [ @@ -1053,8 +1053,8 @@ data: }, "gridPos": { "h": 3, - "w": 5, - "x": 15, + "w": 6, + "x": 18, "y": 8 }, "targets": [ @@ -1128,7 +1128,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 0, "y": 11 @@ -1203,7 +1203,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 6, "y": 11 @@ -1278,7 +1278,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 12, "y": 11 @@ -1345,7 +1345,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 18, "y": 11 @@ -1413,9 +1413,9 @@ data: }, "gridPos": { "h": 6, - "w": 4, + "w": 6, "x": 0, - "y": 16 + "y": 14 }, "targets": [ { @@ -1450,7 +1450,8 @@ data: "value": 48 } ] - } + }, + "decimals": 2 }, "overrides": [] }, @@ -1486,26 +1487,31 @@ data: { "id": 41, "type": "timeseries", - "title": "Ariadne Attempts vs Failures (1h)", + "title": "Ariadne Attempts / Warnings / Failures", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 6, - "w": 8, - "x": 4, - "y": 16 + "w": 6, + "x": 6, + "y": 14 }, "targets": [ { - "expr": "sum(increase(ariadne_task_runs_total[1h]))", + "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", "refId": "A", "legendFormat": "Attempts" }, { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[1h]))", + "expr": "sum(increase(ariadne_task_runs_total{status!~\"ok|error\"}[$__interval])) or on() vector(0)", "refId": "B", + "legendFormat": "Warnings" + }, + { + "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", + "refId": "C", "legendFormat": "Failures" } ], @@ -1513,7 +1519,38 @@ data: "defaults": { "unit": "none" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Warnings" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "yellow" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] }, "options": { "legend": { @@ -1535,9 +1572,9 @@ data: }, "gridPos": { "h": 6, - "w": 8, + "w": 6, "x": 12, - "y": 16 + "y": 14 }, "targets": [ { @@ -1571,9 +1608,9 @@ data: }, "gridPos": { "h": 6, - "w": 4, - "x": 20, - "y": 16 + "w": 6, + "x": 18, + "y": 14 }, "targets": [ { @@ -1647,7 +1684,7 @@ data: "h": 9, "w": 8, "x": 0, - "y": 22 + "y": 20 }, "targets": [ { @@ -1716,7 +1753,7 @@ data: "h": 9, "w": 8, "x": 8, - "y": 22 + "y": 20 }, "targets": [ { @@ -1785,7 +1822,7 @@ data: "h": 9, "w": 8, "x": 16, - "y": 22 + "y": 20 }, "targets": [ { @@ -1854,11 +1891,11 @@ data: "h": 12, "w": 12, "x": 0, - "y": 38 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1901,11 +1938,11 @@ data: "h": 12, "w": 12, "x": 12, - "y": 38 + "y": 36 }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-20|titan-21|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -1948,7 +1985,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 50 + "y": 48 }, "targets": [ { @@ -1985,7 +2022,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 50 + "y": 48 }, "targets": [ { @@ -2022,7 +2059,7 @@ data: "h": 10, "w": 12, "x": 0, - "y": 60 + "y": 58 }, "targets": [ { @@ -2073,7 +2110,7 @@ data: "h": 10, "w": 12, "x": 12, - "y": 60 + "y": 58 }, "targets": [ { @@ -2154,7 +2191,7 @@ data: "h": 7, "w": 8, "x": 0, - "y": 31 + "y": 29 }, "targets": [ { @@ -2198,7 +2235,7 @@ data: "h": 7, "w": 8, "x": 8, - "y": 31 + "y": 29 }, "targets": [ { @@ -2242,7 +2279,7 @@ data: "h": 7, "w": 8, "x": 16, - "y": 31 + "y": 29 }, "targets": [ { @@ -2286,7 +2323,7 @@ data: "h": 16, "w": 12, "x": 0, - "y": 70 + "y": 68 }, "targets": [ { @@ -2334,7 +2371,7 @@ data: "h": 16, "w": 12, "x": 12, - "y": 70 + "y": 68 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index f537d4c..6273023 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -529,7 +529,7 @@ data: }, "targets": [ { - "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.024)))))", + "expr": "(sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace,node) group_left() ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\" , node!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) + on(node) group_left() ((sum by (node) (kube_node_info{node=\"titan-0a\"}) * 0 + 0.001) or (sum by (node) (kube_node_info{node=\"titan-0b\"}) * 0 + 0.002) or (sum by (node) (kube_node_info{node=\"titan-0c\"}) * 0 + 0.003) or (sum by (node) (kube_node_info{node=\"titan-db\"}) * 0 + 0.004) or (sum by (node) (kube_node_info{node=\"titan-jh\"}) * 0 + 0.005) or (sum by (node) (kube_node_info{node=\"titan-04\"}) * 0 + 0.006) or (sum by (node) (kube_node_info{node=\"titan-05\"}) * 0 + 0.007) or (sum by (node) (kube_node_info{node=\"titan-06\"}) * 0 + 0.008) or (sum by (node) (kube_node_info{node=\"titan-07\"}) * 0 + 0.009000000000000001) or (sum by (node) (kube_node_info{node=\"titan-08\"}) * 0 + 0.01) or (sum by (node) (kube_node_info{node=\"titan-09\"}) * 0 + 0.011) or (sum by (node) (kube_node_info{node=\"titan-10\"}) * 0 + 0.012) or (sum by (node) (kube_node_info{node=\"titan-11\"}) * 0 + 0.013000000000000001) or (sum by (node) (kube_node_info{node=\"titan-20\"}) * 0 + 0.014) or (sum by (node) (kube_node_info{node=\"titan-21\"}) * 0 + 0.015) or (sum by (node) (kube_node_info{node=\"titan-12\"}) * 0 + 0.016) or (sum by (node) (kube_node_info{node=\"titan-13\"}) * 0 + 0.017) or (sum by (node) (kube_node_info{node=\"titan-14\"}) * 0 + 0.018000000000000002) or (sum by (node) (kube_node_info{node=\"titan-15\"}) * 0 + 0.019) or (sum by (node) (kube_node_info{node=\"titan-16\"}) * 0 + 0.02) or (sum by (node) (kube_node_info{node=\"titan-17\"}) * 0 + 0.021) or (sum by (node) (kube_node_info{node=\"titan-18\"}) * 0 + 0.022) or (sum by (node) (kube_node_info{node=\"titan-19\"}) * 0 + 0.023) or (sum by (node) (kube_node_info{node=\"titan-22\"}) * 0 + 0.024) or (sum by (node) (kube_node_info{node=\"titan-24\"}) * 0 + 0.025)))))", "refId": "A", "instant": true, "format": "table"