diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 67e486a..083ddfe 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -80,6 +80,7 @@ WORKER_TOTAL = len(WORKER_NODES) CONTROL_SUFFIX = f"/{CONTROL_TOTAL}" WORKER_SUFFIX = f"/{WORKER_TOTAL}" CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring" +LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]" # --------------------------------------------------------------------------- # PromQL helpers @@ -149,9 +150,10 @@ CRASHLOOP_EXPR = ( '{reason=~"CrashLoopBackOff|ImagePullBackOff"}))' ) STUCK_TERMINATING_EXPR = ( - 'sum(max by (namespace,pod) ((' - '(time() - kube_pod_deletion_timestamp{pod!=""}) > 600' - ') and on(namespace,pod) kube_pod_deletion_timestamp{pod!=""} > 0))' + 'sum(max by (namespace,pod) (' + '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)' + ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)' + '))' ) PROBLEM_TABLE_EXPR = ( @@ -168,9 +170,11 @@ CRASHLOOP_TABLE_EXPR = ( "(kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" ) STUCK_TABLE_EXPR = ( + "(" "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) " - "* on(namespace,pod) group_left(node) kube_pod_info) " - "and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0" + "and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) " + "* on(namespace,pod) group_left(node) kube_pod_info" + ")" ) NAMESPACE_CPU_EXPR = ( @@ -192,6 +196,7 @@ IO_SERIES_EXPR = ( "+ rate(node_disk_written_bytes_total[5m]))" ) IO_TOP_EXPR = f"topk(1, {IO_SERIES_EXPR})" +TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" NET_INGRESS_EXPR = ( 'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) ' "or on() vector(0)" @@ -216,6 +221,7 @@ def stat_panel( thresholds=None, text_mode="value", legend=None, + display_name=None, value_suffix=None, links=None, ): @@ -236,6 +242,8 @@ def stat_panel( } if value_suffix: defaults["custom"]["valueSuffix"] = value_suffix + if display_name: + defaults["displayName"] = display_name panel = { "id": panel_id, "type": "stat", @@ -449,8 +457,8 @@ def build_overview(): hottest = [ (7, "Hottest node: CPU", f"topk(1, {node_cpu_expr()})", "percent"), (8, "Hottest node: RAM", f"topk(1, {node_mem_expr()})", "percent"), - (9, "Hottest node: NET", NET_TOP_EXPR, "bytes/sec"), - (10, "Hottest node: I/O", IO_TOP_EXPR, "bytes/sec"), + (9, "Hottest node: NET", NET_TOP_EXPR, "Bps"), + (10, "Hottest node: I/O", IO_TOP_EXPR, "Bps"), ] for idx, (panel_id, title, expr, unit) in enumerate(hottest): panels.append( @@ -462,7 +470,7 @@ def build_overview(): unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, text_mode="value_and_name", - legend="{{node}}", + display_name="{{node}}", links=link_to("atlas-nodes"), ) ) @@ -544,7 +552,7 @@ def build_overview(): "Cluster ingress throughput", NET_INGRESS_EXPR, {"h": 7, "w": 12, "x": 0, "y": 34}, - unit="bytes/sec", + unit="Bps", legend_display="list", legend_placement="bottom", links=link_to("atlas-network"), @@ -556,7 +564,7 @@ def build_overview(): "Cluster egress throughput", NET_EGRESS_EXPR, {"h": 7, "w": 12, "x": 12, "y": 34}, - unit="bytes/sec", + unit="Bps", legend_display="list", legend_placement="bottom", links=link_to("atlas-network"), @@ -616,8 +624,8 @@ def build_overview(): storage_panels = [ (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), - (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "bytesSI"), - (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "bytesSI"), + (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), + (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), ] for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): panels.append( @@ -911,7 +919,7 @@ def build_storage_dashboard(): "Astreae free", astreae_free_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 12, "y": 0}, - unit="bytesSI", + unit="decbytes", ) ) panels.append( @@ -920,14 +928,14 @@ def build_storage_dashboard(): "Asteria free", astreae_free_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 18, "y": 0}, - unit="bytesSI", + unit="decbytes", ) ) panels.append( timeseries_panel( 5, "Astreae per-node usage", - filesystem_usage_expr("/mnt/astreae"), + filesystem_usage_expr("/mnt/astreae", LONGHORN_NODE_REGEX), {"h": 9, "w": 12, "x": 0, "y": 5}, unit="percent", legend="{{node}}", @@ -940,7 +948,7 @@ def build_storage_dashboard(): timeseries_panel( 6, "Asteria per-node usage", - filesystem_usage_expr("/mnt/asteria"), + filesystem_usage_expr("/mnt/asteria", LONGHORN_NODE_REGEX), {"h": 9, "w": 12, "x": 12, "y": 5}, unit="percent", legend="{{node}}", @@ -986,18 +994,19 @@ def build_storage_dashboard(): def build_network_dashboard(): panels = [] panels.append( - stat_panel(1, "Ingress bytes/s", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="bytes/sec") + stat_panel(1, "Ingress traffic", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="Bps") ) panels.append( - stat_panel(2, "Egress bytes/s", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="bytes/sec") + stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps") ) panels.append( stat_panel( 3, "Top router req/s", - 'max(topk(1, rate(traefik_router_requests_total[5m])))', + f"topk(1, {TRAEFIK_ROUTER_EXPR})", {"h": 4, "w": 8, "x": 16, "y": 0}, unit="req/s", + display_name="{{router}}", ) ) panels.append( @@ -1006,7 +1015,7 @@ def build_network_dashboard(): "Per-node throughput", NET_SERIES_EXPR, {"h": 8, "w": 24, "x": 0, "y": 4}, - unit="bytes/sec", + unit="Bps", legend="{{node}}", legend_display="table", legend_placement="right", @@ -1019,7 +1028,7 @@ def build_network_dashboard(): 'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))', {"h": 9, "w": 12, "x": 0, "y": 12}, - unit="bytes/sec", + unit="Bps", transformations=[{"id": "labelsToFields", "options": {}}], ) ) @@ -1030,7 +1039,7 @@ def build_network_dashboard(): 'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))', {"h": 9, "w": 12, "x": 12, "y": 12}, - unit="bytes/sec", + unit="Bps", transformations=[{"id": "labelsToFields", "options": {}}], ) ) @@ -1038,7 +1047,7 @@ def build_network_dashboard(): timeseries_panel( 7, "Traefik routers (req/s)", - 'topk(10, rate(traefik_router_requests_total[5m]))', + f"topk(10, {TRAEFIK_ROUTER_EXPR})", {"h": 9, "w": 12, "x": 0, "y": 21}, unit="req/s", legend="{{router}}", diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index 3846d2a..369024f 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -7,7 +7,7 @@ { "id": 1, "type": "stat", - "title": "Ingress bytes/s", + "title": "Ingress traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -43,7 +43,7 @@ } ] }, - "unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" } @@ -67,7 +67,7 @@ { "id": 2, "type": "stat", - "title": "Egress bytes/s", + "title": "Egress traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -103,7 +103,7 @@ } ] }, - "unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" } @@ -140,7 +140,7 @@ }, "targets": [ { - "expr": "max(topk(1, rate(traefik_router_requests_total[5m])))", + "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", "refId": "A" } ], @@ -166,7 +166,8 @@ "unit": "req/s", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{router}}" }, "overrides": [] }, @@ -207,7 +208,7 @@ ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -243,7 +244,7 @@ ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -279,7 +280,7 @@ ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -309,7 +310,7 @@ }, "targets": [ { - "expr": "topk(10, rate(traefik_router_requests_total[5m]))", + "expr": "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))", "refId": "A", "legendFormat": "{{router}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 3377a13..ec7a848 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -339,7 +339,7 @@ }, "targets": [ { - "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))", + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", "refId": "A" } ], @@ -407,8 +407,7 @@ "targets": [ { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -437,7 +436,8 @@ "unit": "percent", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -479,8 +479,7 @@ "targets": [ { "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -509,7 +508,8 @@ "unit": "percent", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -551,8 +551,7 @@ "targets": [ { "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -574,10 +573,11 @@ } ] }, - "unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -619,8 +619,7 @@ "targets": [ { "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -642,10 +641,11 @@ } ] }, - "unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -944,7 +944,7 @@ ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -987,7 +987,7 @@ ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -1306,7 +1306,7 @@ } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } @@ -1373,7 +1373,7 @@ } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index 3e7dd0e..8494e89 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -140,7 +140,7 @@ }, "targets": [ { - "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))", + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", "refId": "A" } ], @@ -332,7 +332,7 @@ }, "targets": [ { - "expr": "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0", + "expr": "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-storage.json b/services/monitoring/dashboards/atlas-storage.json index bb7d152..6585794 100644 --- a/services/monitoring/dashboards/atlas-storage.json +++ b/services/monitoring/dashboards/atlas-storage.json @@ -171,7 +171,7 @@ } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } @@ -231,7 +231,7 @@ } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } @@ -268,7 +268,7 @@ }, "targets": [ { - "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -306,7 +306,7 @@ }, "targets": [ { - "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index e1ba054..07c8b7a 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -16,7 +16,7 @@ data: { "id": 1, "type": "stat", - "title": "Ingress bytes/s", + "title": "Ingress traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -52,7 +52,7 @@ data: } ] }, - "unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" } @@ -76,7 +76,7 @@ data: { "id": 2, "type": "stat", - "title": "Egress bytes/s", + "title": "Egress traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -112,7 +112,7 @@ data: } ] }, - "unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" } @@ -149,7 +149,7 @@ data: }, "targets": [ { - "expr": "max(topk(1, rate(traefik_router_requests_total[5m])))", + "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", "refId": "A" } ], @@ -175,7 +175,8 @@ data: "unit": "req/s", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{router}}" }, "overrides": [] }, @@ -216,7 +217,7 @@ data: ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -252,7 +253,7 @@ data: ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -288,7 +289,7 @@ data: ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -318,7 +319,7 @@ data: }, "targets": [ { - "expr": "topk(10, rate(traefik_router_requests_total[5m]))", + "expr": "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))", "refId": "A", "legendFormat": "{{router}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 199dfb2..bb3bb11 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -348,7 +348,7 @@ data: }, "targets": [ { - "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))", + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", "refId": "A" } ], @@ -416,8 +416,7 @@ data: "targets": [ { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -446,7 +445,8 @@ data: "unit": "percent", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -488,8 +488,7 @@ data: "targets": [ { "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -518,7 +517,8 @@ data: "unit": "percent", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -560,8 +560,7 @@ data: "targets": [ { "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -583,10 +582,11 @@ data: } ] }, - "unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -628,8 +628,7 @@ data: "targets": [ { "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -651,10 +650,11 @@ data: } ] }, - "unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -953,7 +953,7 @@ data: ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -996,7 +996,7 @@ data: ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -1315,7 +1315,7 @@ data: } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } @@ -1382,7 +1382,7 @@ data: } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index 58cae77..e160eca 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -149,7 +149,7 @@ data: }, "targets": [ { - "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))", + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", "refId": "A" } ], @@ -341,7 +341,7 @@ data: }, "targets": [ { - "expr": "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0", + "expr": "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-storage.yaml b/services/monitoring/grafana-dashboard-storage.yaml index 99439fb..1bbf1ea 100644 --- a/services/monitoring/grafana-dashboard-storage.yaml +++ b/services/monitoring/grafana-dashboard-storage.yaml @@ -180,7 +180,7 @@ data: } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } @@ -240,7 +240,7 @@ data: } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } @@ -277,7 +277,7 @@ data: }, "targets": [ { - "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -315,7 +315,7 @@ data: }, "targets": [ { - "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" }