From 8f5781d3cf22aef5018d84af6b1e592c26c830a9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 16:27:38 -0300 Subject: [PATCH] monitoring: rebuild atlas dashboards --- scripts/render_dashboards.py | 1009 +++++++++++++---- .../monitoring/dashboards/atlas-network.json | 384 +++++++ .../monitoring/dashboards/atlas-nodes.json | 212 ++-- .../monitoring/dashboards/atlas-overview.json | 872 ++++++++------ .../monitoring/dashboards/atlas-pods.json | 260 ++++- .../monitoring/dashboards/atlas-storage.json | 138 ++- .../monitoring/grafana-dashboard-network.yaml | 393 +++++++ .../monitoring/grafana-dashboard-nodes.yaml | 212 ++-- .../grafana-dashboard-overview.yaml | 872 ++++++++------ .../monitoring/grafana-dashboard-pods.yaml | 260 ++++- .../monitoring/grafana-dashboard-storage.yaml | 138 ++- services/monitoring/grafana-folders.yaml | 18 +- services/monitoring/helmrelease.yaml | 15 +- services/monitoring/kustomization.yaml | 1 + 14 files changed, 3559 insertions(+), 1225 deletions(-) mode change 100755 => 100644 scripts/render_dashboards.py create mode 100644 services/monitoring/dashboards/atlas-network.json create mode 100644 services/monitoring/grafana-dashboard-network.yaml diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py old mode 100755 new mode 100644 index fa9ef58..67e486a --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -1,15 +1,20 @@ #!/usr/bin/env python3 -"""Generate Grafana dashboards and render them into ConfigMaps. +"""Generate Atlas Grafana dashboards and render them into ConfigMaps. Usage: - python scripts/render_dashboards.py --build # rebuild JSON + ConfigMaps - python scripts/render_dashboards.py # just render ConfigMaps + scripts/render_dashboards.py --build # rebuild JSON + ConfigMaps + scripts/render_dashboards.py # re-render ConfigMaps from JSON """ + import argparse import json import textwrap from pathlib import Path +# --------------------------------------------------------------------------- +# Paths, folders, and shared metadata +# --------------------------------------------------------------------------- + ROOT = Path(__file__).resolve().parents[1] DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards" CONFIG_TEMPLATE = textwrap.dedent( @@ -27,15 +32,194 @@ data: ) PROM_DS = {"type": "prometheus", "uid": "atlas-vm"} +PUBLIC_FOLDER = "atlas-overview" +PRIVATE_FOLDER = "atlas-internal" + +PERCENT_THRESHOLDS = { + "mode": "percentage", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 85}, + ], +} + +# --------------------------------------------------------------------------- +# Cluster metadata +# --------------------------------------------------------------------------- + +CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"] +CONTROL_DEPENDENCIES = ["titan-db"] +CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES +WORKER_NODES = [ + "titan-04", + "titan-05", + "titan-06", + "titan-07", + "titan-08", + "titan-09", + "titan-10", + "titan-11", + "titan-12", + "titan-13", + "titan-14", + "titan-15", + "titan-16", + "titan-17", + "titan-18", + "titan-19", + "titan-22", + "titan-24", +] + +CONTROL_REGEX = "|".join(CONTROL_PLANE_NODES) +CONTROL_ALL_REGEX = "|".join(CONTROL_ALL) +WORKER_REGEX = "|".join(WORKER_NODES) +CONTROL_TOTAL = len(CONTROL_PLANE_NODES) +WORKER_TOTAL = len(WORKER_NODES) +CONTROL_SUFFIX = f"/{CONTROL_TOTAL}" +WORKER_SUFFIX = f"/{WORKER_TOTAL}" +CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring" + +# --------------------------------------------------------------------------- +# PromQL helpers +# --------------------------------------------------------------------------- + +NODE_INFO = 'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")' -# --------------------------------------------------------------------------- # -# Panel helper factories -# --------------------------------------------------------------------------- # +def node_filter(regex): + """Return a selector that evaluates to 1 for nodes matching the regex.""" + return ( + f'label_replace(node_uname_info{{nodename=~"{regex}"}}, ' + '"node", "$1", "nodename", "(.*)")' + ) -def stat_panel(panel_id, title, expr, grid, *, unit="none", thresholds=None, - text_mode="value", legend=None): +def scoped_node_expr(base, scope=""): + """Attach nodename metadata and optionally filter to a scope regex.""" + expr = f"avg by (node) (({base}) * on(instance) group_left(node) {NODE_INFO})" + if scope: + expr = f"({expr}) * on(node) group_left() {node_filter(scope)}" + return expr + + +def node_cpu_expr(scope=""): + idle = 'avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))' + base = f"(1 - {idle}) * 100" + return scoped_node_expr(base, scope) + + +def node_mem_expr(scope=""): + usage = ( + "avg by (instance) (" + "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) " + "/ node_memory_MemTotal_bytes * 100)" + ) + return scoped_node_expr(usage, scope) + + +def filesystem_usage_expr(mount, scope=""): + base = ( + f'avg by (instance) (' + f'(1 - (node_filesystem_avail_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}} ' + f'/ node_filesystem_size_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}})) * 100)' + ) + return scoped_node_expr(base, scope) + + +def root_usage_expr(scope=""): + return filesystem_usage_expr("/", scope) + + +def astreae_usage_expr(mount): + return ( + f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / " + f"sum(node_filesystem_size_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) * 100)" + ) + + +def astreae_free_expr(mount): + return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})" + + +PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' +CRASHLOOP_EXPR = ( + 'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason' + '{reason=~"CrashLoopBackOff|ImagePullBackOff"}))' +) +STUCK_TERMINATING_EXPR = ( + 'sum(max by (namespace,pod) ((' + '(time() - kube_pod_deletion_timestamp{pod!=""}) > 600' + ') and on(namespace,pod) kube_pod_deletion_timestamp{pod!=""} > 0))' +) + +PROBLEM_TABLE_EXPR = ( + "(time() - kube_pod_created{pod!=\"\"}) " + "* on(namespace,pod) group_left(node) kube_pod_info " + "* on(namespace,pod) group_left(phase) " + "max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})" +) +CRASHLOOP_TABLE_EXPR = ( + "(time() - kube_pod_created{pod!=\"\"}) " + "* on(namespace,pod) group_left(node) kube_pod_info " + "* on(namespace,pod,container) group_left(reason) " + "max by (namespace,pod,container,reason) " + "(kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" +) +STUCK_TABLE_EXPR = ( + "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) " + "* on(namespace,pod) group_left(node) kube_pod_info) " + "and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0" +) + +NAMESPACE_CPU_EXPR = ( + 'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=""' + ',pod!=""}[5m])) by (namespace))' +) +NAMESPACE_RAM_EXPR = ( + 'topk(10, sum(container_memory_working_set_bytes{namespace!=""' + ',pod!=""}) by (namespace))' +) +NET_SERIES_EXPR = ( + 'avg by (node) (' + 'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]) ' + '+ rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))' +) +NET_TOP_EXPR = f"topk(1, {NET_SERIES_EXPR})" +IO_SERIES_EXPR = ( + "avg by (node) (rate(node_disk_read_bytes_total[5m]) " + "+ rate(node_disk_written_bytes_total[5m]))" +) +IO_TOP_EXPR = f"topk(1, {IO_SERIES_EXPR})" +NET_INGRESS_EXPR = ( + 'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) ' + "or on() vector(0)" +) +NET_EGRESS_EXPR = ( + 'sum(rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m])) ' + "or on() vector(0)" +) + +# --------------------------------------------------------------------------- +# Panel factories +# --------------------------------------------------------------------------- + + +def stat_panel( + panel_id, + title, + expr, + grid, + *, + unit="none", + thresholds=None, + text_mode="value", + legend=None, + value_suffix=None, + links=None, +): + """Return a Grafana stat panel definition.""" defaults = { "color": {"mode": "palette-classic"}, "mappings": [], @@ -48,7 +232,10 @@ def stat_panel(panel_id, title, expr, grid, *, unit="none", thresholds=None, ], }, "unit": unit, + "custom": {"displayMode": "auto"}, } + if value_suffix: + defaults["custom"]["valueSuffix"] = value_suffix panel = { "id": panel_id, "type": "stat", @@ -67,12 +254,26 @@ def stat_panel(panel_id, title, expr, grid, *, unit="none", thresholds=None, } if legend: panel["targets"][0]["legendFormat"] = legend + if links: + panel["links"] = links return panel -def timeseries_panel(panel_id, title, expr, grid, *, unit="none", legend=None, - legend_display="table", legend_placement="bottom", - legend_calcs=None, time_from=None): +def timeseries_panel( + panel_id, + title, + expr, + grid, + *, + unit="none", + legend=None, + legend_display="table", + legend_placement="bottom", + legend_calcs=None, + time_from=None, + links=None, +): + """Return a Grafana time-series panel definition.""" panel = { "id": panel_id, "type": "timeseries", @@ -95,11 +296,21 @@ def timeseries_panel(panel_id, title, expr, grid, *, unit="none", legend=None, panel["options"]["legend"]["calcs"] = legend_calcs if time_from: panel["timeFrom"] = time_from + if links: + panel["links"] = links return panel -def table_panel(panel_id, title, expr, grid, *, unit="none", transformations=None, - description=None): +def table_panel( + panel_id, + title, + expr, + grid, + *, + unit="none", + transformations=None, +): + """Return a Grafana table panel definition.""" panel = { "id": panel_id, "type": "table", @@ -112,20 +323,25 @@ def table_panel(panel_id, title, expr, grid, *, unit="none", transformations=Non } if transformations: panel["transformations"] = transformations - if description: - panel["description"] = description return panel def pie_panel(panel_id, title, expr, grid): + """Return a pie chart panel with readable namespace labels.""" return { "id": panel_id, "type": "piechart", "title": title, "datasource": PROM_DS, "gridPos": grid, - "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}], - "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []}, + "targets": [{"expr": expr, "refId": "A"}], + "fieldConfig": { + "defaults": { + "unit": "percent", + "displayName": "{{namespace}}", + }, + "overrides": [], + }, "options": { "legend": {"displayMode": "list", "placement": "right"}, "pieType": "pie", @@ -145,192 +361,238 @@ def text_panel(panel_id, title, content, grid): } -def node_cpu_expr(scope=""): - expr = "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))" - if scope: - expr = f"{expr} * on(node) group_left() kube_node_info{{node=~\"{scope}\"}}" - return expr +def link_to(uid): + return [{"title": f"Open {uid} dashboard", "url": f"/d/{uid}", "targetBlank": True}] -def node_mem_expr(scope=""): - expr = "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))" - if scope: - expr = f"{expr} * on(node) group_left() kube_node_info{{node=~\"{scope}\"}}" - return expr - - -def root_usage_expr(): - return "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)" - - -def astreae_usage_expr(mount): - return ( - f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / " - f"sum(node_filesystem_size_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) * 100)" - ) - - -def astreae_free_expr(mount): - return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})" +# --------------------------------------------------------------------------- +# Dashboard builders +# --------------------------------------------------------------------------- def build_overview(): - thresholds_percent = { - "mode": "percentage", - "steps": [ - {"color": "green", "value": None}, - {"color": "yellow", "value": 70}, - {"color": "red", "value": 85}, - ], - } panels = [] - stats = [ - (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})'), - (2, "Ready nodes", 'sum(kube_node_status_condition{condition="Ready",status="true"})'), - (3, "Control plane ready", 'sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})'), - (4, "Control plane schedulable", 'sum(kube_node_spec_unschedulable{node=~"titan-0a|titan-0b|titan-0c"} == 0)'), - (5, "Problem pods", 'sum(kube_pod_status_phase{phase!~"Running|Succeeded"})'), - (6, "Stuck terminating", 'sum(((time() - kube_pod_deletion_timestamp) > 600))'), + + row1_stats = [ + (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None), + ( + 2, + "Ready nodes", + f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', + WORKER_SUFFIX, + WORKER_TOTAL, + None, + ), + ( + 3, + "Control plane ready", + f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})', + CONTROL_SUFFIX, + CONTROL_TOTAL, + None, + ), + ( + 4, + "Control plane workloads", + f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})', + None, + 1, + link_to("atlas-pods"), + ), + ( + 5, + "Problem pods", + PROBLEM_PODS_EXPR, + None, + 1, + link_to("atlas-pods"), + ), + ( + 6, + "Stuck terminating", + STUCK_TERMINATING_EXPR, + None, + 1, + link_to("atlas-pods"), + ), ] - for idx, (panel_id, title, expr) in enumerate(stats): + for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats): + thresholds = None + if panel_id in (2, 3): + thresholds = { + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "green", "value": ok_value}, + ], + } + elif panel_id >= 4: + thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": 1}, + ], + } panels.append( stat_panel( panel_id, title, expr, {"h": 5, "w": 4, "x": 4 * idx, "y": 0}, + value_suffix=suffix, + thresholds=thresholds, + links=links, ) ) - panels.append( - stat_panel( - 7, - "Hottest node: CPU", - node_cpu_expr(), - {"h": 5, "w": 4, "x": 24, "y": 0}, - unit="percent", - thresholds=thresholds_percent, - text_mode="value_and_name", - legend="{{node}}", - ) - ) - panels.append( - stat_panel( - 8, - "Hottest node: RAM", - node_mem_expr(), - {"h": 5, "w": 4, "x": 28, "y": 0}, - unit="percent", - thresholds=thresholds_percent, - text_mode="value_and_name", - legend="{{node}}", - ) - ) - panels.append(pie_panel(9, "Namespace CPU share", 'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace))', {"h": 9, "w": 12, "x": 0, "y": 5})) - panels.append(pie_panel(10, "Namespace RAM share", 'topk(10, sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace))', {"h": 9, "w": 12, "x": 12, "y": 5})) + hottest = [ + (7, "Hottest node: CPU", f"topk(1, {node_cpu_expr()})", "percent"), + (8, "Hottest node: RAM", f"topk(1, {node_mem_expr()})", "percent"), + (9, "Hottest node: NET", NET_TOP_EXPR, "bytes/sec"), + (10, "Hottest node: I/O", IO_TOP_EXPR, "bytes/sec"), + ] + for idx, (panel_id, title, expr, unit) in enumerate(hottest): + panels.append( + stat_panel( + panel_id, + title, + expr, + {"h": 5, "w": 6, "x": 6 * idx, "y": 5}, + unit=unit, + thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, + text_mode="value_and_name", + legend="{{node}}", + links=link_to("atlas-nodes"), + ) + ) + + panels.append( + pie_panel( + 11, + "Namespace CPU share", + NAMESPACE_CPU_EXPR, + {"h": 9, "w": 12, "x": 0, "y": 10}, + ) + ) + panels.append( + pie_panel( + 12, + "Namespace RAM share", + NAMESPACE_RAM_EXPR, + {"h": 9, "w": 12, "x": 12, "y": 10}, + ) + ) panels.append( timeseries_panel( - 11, + 13, "Cluster node CPU", node_cpu_expr(), - {"h": 8, "w": 12, "x": 0, "y": 14}, + {"h": 8, "w": 12, "x": 0, "y": 19}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", + links=link_to("atlas-nodes"), ) ) panels.append( timeseries_panel( - 12, + 14, "Cluster node RAM", node_mem_expr(), - {"h": 8, "w": 12, "x": 12, "y": 14}, + {"h": 8, "w": 12, "x": 12, "y": 19}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", - ) - ) - - panels.append( - table_panel( - 13, - "Problem pods (details)", - "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", - {"h": 8, "w": 12, "x": 0, "y": 22}, - unit="s", - transformations=[{"id": "labelsToFields", "options": {}}], - ) - ) - panels.append( - table_panel( - 14, - "Terminating >10m", - "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", - {"h": 8, "w": 12, "x": 12, "y": 22}, - unit="s", - transformations=[ - {"id": "labelsToFields", "options": {}} , - {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}}, - ], + links=link_to("atlas-nodes"), ) ) panels.append( timeseries_panel( 15, - "Control plane CPU", - node_cpu_expr("titan-0a|titan-0b|titan-0c"), - {"h": 7, "w": 12, "x": 0, "y": 30}, + "Control plane CPU (incl. titan-db)", + node_cpu_expr(CONTROL_ALL_REGEX), + {"h": 7, "w": 12, "x": 0, "y": 27}, unit="percent", legend="{{node}}", + legend_display="table", + legend_placement="right", ) ) panels.append( timeseries_panel( 16, - "Control plane RAM", - node_mem_expr("titan-0a|titan-0b|titan-0c"), - {"h": 7, "w": 12, "x": 12, "y": 30}, + "Control plane RAM (incl. titan-db)", + node_mem_expr(CONTROL_ALL_REGEX), + {"h": 7, "w": 12, "x": 12, "y": 27}, unit="percent", legend="{{node}}", + legend_display="table", + legend_placement="right", ) ) panels.append( timeseries_panel( 17, + "Cluster ingress throughput", + NET_INGRESS_EXPR, + {"h": 7, "w": 12, "x": 0, "y": 34}, + unit="bytes/sec", + legend_display="list", + legend_placement="bottom", + links=link_to("atlas-network"), + ) + ) + panels.append( + timeseries_panel( + 18, + "Cluster egress throughput", + NET_EGRESS_EXPR, + {"h": 7, "w": 12, "x": 12, "y": 34}, + unit="bytes/sec", + legend_display="list", + legend_placement="bottom", + links=link_to("atlas-network"), + ) + ) + + panels.append( + timeseries_panel( + 19, "Root filesystem usage", root_usage_expr(), - {"h": 8, "w": 12, "x": 0, "y": 37}, + {"h": 8, "w": 12, "x": 0, "y": 41}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", - time_from="7d", + time_from="30d", + links=link_to("atlas-storage"), ) ) - panels.append( { - "id": 18, + "id": 20, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": PROM_DS, - "gridPos": {"h": 8, "w": 12, "x": 12, "y": 37}, - "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}], + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 41}, + "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A"}], "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 50}, @@ -338,6 +600,7 @@ def build_overview(): {"color": "red", "value": 85}, ], }, + "displayName": "{{node}}", }, "overrides": [], }, @@ -346,143 +609,157 @@ def build_overview(): "orientation": "horizontal", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, }, + "links": link_to("atlas-storage"), } ) - panels.append( - stat_panel( - 19, - "Astreae usage", - astreae_usage_expr("/mnt/astreae"), - {"h": 6, "w": 6, "x": 0, "y": 45}, - unit="percent", - thresholds=thresholds_percent, + storage_panels = [ + (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), + (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), + (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "bytesSI"), + (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "bytesSI"), + ] + for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): + panels.append( + stat_panel( + panel_id, + title, + expr, + {"h": 6, "w": 6, "x": 6 * idx, "y": 49}, + unit=unit, + thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, + links=link_to("atlas-storage"), + ) ) - ) - panels.append( - stat_panel( - 20, - "Asteria usage", - astreae_usage_expr("/mnt/asteria"), - {"h": 6, "w": 6, "x": 6, "y": 45}, - unit="percent", - thresholds=thresholds_percent, - ) - ) - panels.append( - stat_panel( - 21, - "Astreae free", - astreae_free_expr("/mnt/astreae"), - {"h": 6, "w": 6, "x": 12, "y": 45}, - unit="bytesSI", - ) - ) - panels.append( - stat_panel( - 22, - "Asteria free", - astreae_free_expr("/mnt/asteria"), - {"h": 6, "w": 6, "x": 18, "y": 45}, - unit="bytesSI", - ) - ) - - panels.append( - table_panel( - 23, - "Astreae per-node usage", - '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} * 100)', - {"h": 8, "w": 12, "x": 0, "y": 51}, - unit="percent", - transformations=[{"id": "labelsToFields", "options": {}}], - ) - ) - panels.append( - table_panel( - 24, - "Asteria per-node usage", - '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} * 100)', - {"h": 8, "w": 12, "x": 12, "y": 51}, - unit="percent", - transformations=[{"id": "labelsToFields", "options": {}}], - ) - ) panels.append( text_panel( 25, "About this dashboard", - "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders", - {"h": 5, "w": 24, "x": 0, "y": 59}, + textwrap.dedent( + """\ +### Atlas Overview +- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs. +- Control plane workload count flags any non-system pods that slipped onto the HA nodes. +- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly.""" + ), + {"h": 5, "w": 24, "x": 0, "y": 55}, ) ) return { "uid": "atlas-overview", "title": "Atlas Overview", - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": {"type": "datasource", "uid": "grafana"}, - "enable": True, - "hide": True, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard", - } - ] - }, + "folderUid": PUBLIC_FOLDER, "editable": False, - "folderUid": "atlas-overview", - "graphTooltip": 0, - "links": [ - {"title": "Pods dashboard", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False}, - {"title": "Nodes dashboard", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False}, - {"title": "Storage dashboard", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False}, - ], + "annotations": {"list": []}, "panels": panels, - "refresh": "30s", "schemaVersion": 39, "style": "dark", "tags": ["atlas", "overview"], "templating": {"list": []}, "time": {"from": "now-12h", "to": "now"}, + "links": [ + {"title": "Atlas Pods", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False}, + {"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False}, + {"title": "Atlas Storage", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False}, + {"title": "Atlas Network", "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": False}, + ], } def build_pods_dashboard(): panels = [] panels.append( - table_panel( + stat_panel( 1, - "Pods not running", - "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", - {"h": 10, "w": 24, "x": 0, "y": 0}, - unit="s", - transformations=[{"id": "labelsToFields", "options": {}}], + "Problem pods", + PROBLEM_PODS_EXPR, + {"h": 4, "w": 6, "x": 0, "y": 0}, + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": 1}, + ], + }, ) ) panels.append( - table_panel( + stat_panel( 2, "CrashLoop / ImagePull", - "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", - {"h": 10, "w": 24, "x": 0, "y": 10}, + CRASHLOOP_EXPR, + {"h": 4, "w": 6, "x": 6, "y": 0}, + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": 1}, + ], + }, + ) + ) + panels.append( + stat_panel( + 3, + "Stuck terminating (>10m)", + STUCK_TERMINATING_EXPR, + {"h": 4, "w": 6, "x": 12, "y": 0}, + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": 1}, + ], + }, + ) + ) + panels.append( + stat_panel( + 4, + "Control plane workloads", + f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})', + {"h": 4, "w": 6, "x": 18, "y": 0}, + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": 1}, + ], + }, + ) + ) + + panels.append( + table_panel( + 5, + "Pods not running", + PROBLEM_TABLE_EXPR, + {"h": 10, "w": 24, "x": 0, "y": 4}, unit="s", transformations=[{"id": "labelsToFields", "options": {}}], ) ) panels.append( table_panel( - 3, - "Terminating pods", - "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", - {"h": 10, "w": 24, "x": 0, "y": 20}, + 6, + "CrashLoop / ImagePull", + CRASHLOOP_TABLE_EXPR, + {"h": 10, "w": 24, "x": 0, "y": 14}, + unit="s", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + panels.append( + table_panel( + 7, + "Terminating >10m", + STUCK_TABLE_EXPR, + {"h": 10, "w": 24, "x": 0, "y": 24}, unit="s", transformations=[ - {"id": "labelsToFields", "options": {}} , + {"id": "labelsToFields", "options": {}}, {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}}, ], ) @@ -490,7 +767,7 @@ def build_pods_dashboard(): return { "uid": "atlas-pods", "title": "Atlas Pods", - "folderUid": "atlas-pods", + "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, "time": {"from": "now-12h", "to": "now"}, @@ -503,17 +780,99 @@ def build_pods_dashboard(): def build_nodes_dashboard(): panels = [] - panels.append(stat_panel(1, "Node count", 'count(kube_node_info)', {"h": 5, "w": 6, "x": 0, "y": 0})) - panels.append(stat_panel(2, "Ready nodes", 'sum(kube_node_status_condition{condition="Ready",status="true"})', {"h": 5, "w": 6, "x": 6, "y": 0})) - panels.append(stat_panel(3, "Control plane CPU avg", node_cpu_expr("titan-0a|titan-0b|titan-0c"), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="percent", legend="{{node}}", text_mode="value_and_name")) - panels.append(stat_panel(4, "Control plane RAM avg", node_mem_expr("titan-0a|titan-0b|titan-0c"), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="percent", legend="{{node}}", text_mode="value_and_name")) - panels.append(timeseries_panel(5, "Node CPU", node_cpu_expr(), {"h": 9, "w": 24, "x": 0, "y": 5}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right")) - panels.append(timeseries_panel(6, "Node RAM", node_mem_expr(), {"h": 9, "w": 24, "x": 0, "y": 14}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right")) - panels.append(timeseries_panel(7, "Root filesystem", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 23}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="7d")) + panels.append( + stat_panel( + 1, + "Worker nodes ready", + f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', + {"h": 4, "w": 8, "x": 0, "y": 0}, + value_suffix=WORKER_SUFFIX, + ) + ) + panels.append( + stat_panel( + 2, + "Control plane ready", + f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})', + {"h": 4, "w": 8, "x": 8, "y": 0}, + value_suffix=CONTROL_SUFFIX, + ) + ) + panels.append( + stat_panel( + 3, + "Control plane workloads", + f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})', + {"h": 4, "w": 8, "x": 16, "y": 0}, + ) + ) + panels.append( + timeseries_panel( + 4, + "Node CPU", + node_cpu_expr(), + {"h": 9, "w": 24, "x": 0, "y": 4}, + unit="percent", + legend="{{node}}", + legend_calcs=["last"], + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 5, + "Node RAM", + node_mem_expr(), + {"h": 9, "w": 24, "x": 0, "y": 13}, + unit="percent", + legend="{{node}}", + legend_calcs=["last"], + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 6, + "Control plane (incl. titan-db) CPU", + node_cpu_expr(CONTROL_ALL_REGEX), + {"h": 9, "w": 12, "x": 0, "y": 22}, + unit="percent", + legend="{{node}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 7, + "Control plane (incl. titan-db) RAM", + node_mem_expr(CONTROL_ALL_REGEX), + {"h": 9, "w": 12, "x": 12, "y": 22}, + unit="percent", + legend="{{node}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 8, + "Root filesystem usage", + root_usage_expr(), + {"h": 9, "w": 24, "x": 0, "y": 31}, + unit="percent", + legend="{{node}}", + legend_display="table", + legend_placement="right", + time_from="30d", + ) + ) return { "uid": "atlas-nodes", "title": "Atlas Nodes", - "folderUid": "atlas-nodes", + "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, "time": {"from": "now-12h", "to": "now"}, @@ -526,17 +885,94 @@ def build_nodes_dashboard(): def build_storage_dashboard(): panels = [] - panels.append(stat_panel(1, "Astreae usage", astreae_usage_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 0, "y": 0}, unit="percent")) - panels.append(stat_panel(2, "Asteria usage", astreae_usage_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 6, "y": 0}, unit="percent")) - panels.append(stat_panel(3, "Astreae free", astreae_free_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="bytesSI")) - panels.append(stat_panel(4, "Asteria free", astreae_free_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="bytesSI")) - panels.append(timeseries_panel(5, "Root filesystem", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 5}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="30d")) - panels.append(table_panel(6, "Astreae nodes", '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} * 100)', {"h": 10, "w": 12, "x": 0, "y": 14}, unit="percent", transformations=[{"id": "labelsToFields", "options": {}}])) - panels.append(table_panel(7, "Asteria nodes", '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} * 100)', {"h": 10, "w": 12, "x": 12, "y": 14}, unit="percent", transformations=[{"id": "labelsToFields", "options": {}}])) + panels.append( + stat_panel( + 1, + "Astreae usage", + astreae_usage_expr("/mnt/astreae"), + {"h": 5, "w": 6, "x": 0, "y": 0}, + unit="percent", + thresholds=PERCENT_THRESHOLDS, + ) + ) + panels.append( + stat_panel( + 2, + "Asteria usage", + astreae_usage_expr("/mnt/asteria"), + {"h": 5, "w": 6, "x": 6, "y": 0}, + unit="percent", + thresholds=PERCENT_THRESHOLDS, + ) + ) + panels.append( + stat_panel( + 3, + "Astreae free", + astreae_free_expr("/mnt/astreae"), + {"h": 5, "w": 6, "x": 12, "y": 0}, + unit="bytesSI", + ) + ) + panels.append( + stat_panel( + 4, + "Asteria free", + astreae_free_expr("/mnt/asteria"), + {"h": 5, "w": 6, "x": 18, "y": 0}, + unit="bytesSI", + ) + ) + panels.append( + timeseries_panel( + 5, + "Astreae per-node usage", + filesystem_usage_expr("/mnt/astreae"), + {"h": 9, "w": 12, "x": 0, "y": 5}, + unit="percent", + legend="{{node}}", + legend_display="table", + legend_placement="right", + time_from="30d", + ) + ) + panels.append( + timeseries_panel( + 6, + "Asteria per-node usage", + filesystem_usage_expr("/mnt/asteria"), + {"h": 9, "w": 12, "x": 12, "y": 5}, + unit="percent", + legend="{{node}}", + legend_display="table", + legend_placement="right", + time_from="30d", + ) + ) + panels.append( + timeseries_panel( + 7, + "Astreae usage history", + astreae_usage_expr("/mnt/astreae"), + {"h": 9, "w": 12, "x": 0, "y": 14}, + unit="percent", + time_from="90d", + ) + ) + panels.append( + timeseries_panel( + 8, + "Asteria usage history", + astreae_usage_expr("/mnt/asteria"), + {"h": 9, "w": 12, "x": 12, "y": 14}, + unit="percent", + time_from="90d", + ) + ) return { "uid": "atlas-storage", "title": "Atlas Storage", - "folderUid": "atlas-storage", + "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, "time": {"from": "now-12h", "to": "now"}, @@ -547,6 +983,95 @@ def build_storage_dashboard(): } +def build_network_dashboard(): + panels = [] + panels.append( + stat_panel(1, "Ingress bytes/s", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="bytes/sec") + ) + panels.append( + stat_panel(2, "Egress bytes/s", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="bytes/sec") + ) + panels.append( + stat_panel( + 3, + "Top router req/s", + 'max(topk(1, rate(traefik_router_requests_total[5m])))', + {"h": 4, "w": 8, "x": 16, "y": 0}, + unit="req/s", + ) + ) + panels.append( + timeseries_panel( + 4, + "Per-node throughput", + NET_SERIES_EXPR, + {"h": 8, "w": 24, "x": 0, "y": 4}, + unit="bytes/sec", + legend="{{node}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + table_panel( + 5, + "Top namespaces", + 'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) ' + '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))', + {"h": 9, "w": 12, "x": 0, "y": 12}, + unit="bytes/sec", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + panels.append( + table_panel( + 6, + "Top pods", + 'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) ' + '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))', + {"h": 9, "w": 12, "x": 12, "y": 12}, + unit="bytes/sec", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + panels.append( + timeseries_panel( + 7, + "Traefik routers (req/s)", + 'topk(10, rate(traefik_router_requests_total[5m]))', + {"h": 9, "w": 12, "x": 0, "y": 21}, + unit="req/s", + legend="{{router}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 8, + "Traefik entrypoints (req/s)", + 'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))', + {"h": 9, "w": 12, "x": 12, "y": 21}, + unit="req/s", + legend="{{entrypoint}}", + legend_display="table", + legend_placement="right", + ) + ) + return { + "uid": "atlas-network", + "title": "Atlas Network", + "folderUid": PRIVATE_FOLDER, + "editable": True, + "panels": panels, + "time": {"from": "now-12h", "to": "now"}, + "annotations": {"list": []}, + "schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "network"], + } + + DASHBOARDS = { "atlas-overview": { "builder": build_overview, @@ -564,20 +1089,24 @@ DASHBOARDS = { "builder": build_storage_dashboard, "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-storage.yaml", }, + "atlas-network": { + "builder": build_network_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml", + }, } -def write_json(uid: str, data: dict) -> None: +def write_json(uid, data): DASHBOARD_DIR.mkdir(parents=True, exist_ok=True) path = DASHBOARD_DIR / f"{uid}.json" path.write_text(json.dumps(data, indent=2) + "\n") -def render_configmap(uid: str, data: dict) -> None: +def render_configmap(uid, info): json_path = DASHBOARD_DIR / f"{uid}.json" payload = json.dumps(json.loads(json_path.read_text()), indent=2) indented = "\n".join(" " + line for line in payload.splitlines()) - output_path = data["configmap"] + output_path = info["configmap"] content = CONFIG_TEMPLATE.format( relative_path=output_path.relative_to(ROOT), name=output_path.stem, diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json new file mode 100644 index 0000000..3846d2a --- /dev/null +++ b/services/monitoring/dashboards/atlas-network.json @@ -0,0 +1,384 @@ +{ + "uid": "atlas-network", + "title": "Atlas Network", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Ingress bytes/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Egress bytes/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 0 + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Top router req/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 0 + }, + "targets": [ + { + "expr": "max(topk(1, rate(traefik_router_requests_total[5m])))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "req/s", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "timeseries", + "title": "Per-node throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 4 + }, + "targets": [ + { + "expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 5, + "type": "table", + "title": "Top namespaces", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 6, + "type": "table", + "title": "Top pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 12 + }, + "targets": [ + { + "expr": "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 7, + "type": "timeseries", + "title": "Traefik routers (req/s)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 21 + }, + "targets": [ + { + "expr": "topk(10, rate(traefik_router_requests_total[5m]))", + "refId": "A", + "legendFormat": "{{router}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "req/s" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 8, + "type": "timeseries", + "title": "Traefik entrypoints (req/s)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 21 + }, + "targets": [ + { + "expr": "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))", + "refId": "A", + "legendFormat": "{{entrypoint}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "req/s" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "network" + ] +} diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index d3393a9..e974d8a 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -1,26 +1,26 @@ { "uid": "atlas-nodes", "title": "Atlas Nodes", - "folderUid": "atlas-nodes", + "folderUid": "atlas-internal", "editable": true, "panels": [ { "id": 1, "type": "stat", - "title": "Node count", + "title": "Worker nodes ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 6, + "h": 4, + "w": 8, "x": 0, "y": 0 }, "targets": [ { - "expr": "count(kube_node_info)", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -43,7 +43,11 @@ } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": "/18" + } }, "overrides": [] }, @@ -64,20 +68,20 @@ { "id": 2, "type": "stat", - "title": "Ready nodes", + "title": "Control plane ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 6, - "x": 6, + "h": 4, + "w": 8, + "x": 8, "y": 0 }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})", "refId": "A" } ], @@ -100,7 +104,11 @@ } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": "/3" + } }, "overrides": [] }, @@ -121,22 +129,21 @@ { "id": 3, "type": "stat", - "title": "Control plane CPU avg", + "title": "Control plane workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 6, - "x": 12, + "h": 4, + "w": 8, + "x": 16, "y": 0 }, "targets": [ { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", - "refId": "A", - "legendFormat": "{{node}}" + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "refId": "A" } ], "fieldConfig": { @@ -158,7 +165,10 @@ } ] }, - "unit": "percent" + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -173,69 +183,11 @@ "fields": "", "values": false }, - "textMode": "value_and_name" + "textMode": "value" } }, { "id": 4, - "type": "stat", - "title": "Control plane RAM avg", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 6, - "x": 18, - "y": 0 - }, - "targets": [ - { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value_and_name" - } - }, - { - "id": 5, "type": "timeseries", "title": "Node CPU", "datasource": { @@ -246,11 +198,51 @@ "h": 9, "w": 24, "x": 0, - "y": 5 + "y": 4 }, "targets": [ { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 5, + "type": "timeseries", + "title": "Node RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 13 + }, + "targets": [ + { + "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -277,20 +269,20 @@ { "id": 6, "type": "timeseries", - "title": "Node RAM", + "title": "Control plane (incl. titan-db) CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 24, + "w": 12, "x": 0, - "y": 14 + "y": 22 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -304,10 +296,7 @@ "options": { "legend": { "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] + "placement": "right" }, "tooltip": { "mode": "multi" @@ -317,7 +306,44 @@ { "id": 7, "type": "timeseries", - "title": "Root filesystem", + "title": "Control plane (incl. titan-db) RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 22 + }, + "targets": [ + { + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 8, + "type": "timeseries", + "title": "Root filesystem usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -326,11 +352,11 @@ "h": 9, "w": 24, "x": 0, - "y": 23 + "y": 31 }, "targets": [ { - "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -350,7 +376,7 @@ "mode": "multi" } }, - "timeFrom": "7d" + "timeFrom": "30d" } ], "time": { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index d7a0d27..3377a13 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1,45 +1,11 @@ { "uid": "atlas-overview", "title": "Atlas Overview", - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": false, "folderUid": "atlas-overview", - "graphTooltip": 0, - "links": [ - { - "title": "Pods dashboard", - "type": "dashboard", - "dashboardUid": "atlas-pods", - "keepTime": false - }, - { - "title": "Nodes dashboard", - "type": "dashboard", - "dashboardUid": "atlas-nodes", - "keepTime": false - }, - { - "title": "Storage dashboard", - "type": "dashboard", - "dashboardUid": "atlas-storage", - "keepTime": false - } - ], + "editable": false, + "annotations": { + "list": [] + }, "panels": [ { "id": 1, @@ -80,7 +46,10 @@ } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -114,7 +83,7 @@ }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -128,16 +97,20 @@ "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "red", "value": null }, { "color": "green", - "value": 1 + "value": 18 } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": "/18" + } }, "overrides": [] }, @@ -185,16 +158,20 @@ "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "red", "value": null }, { "color": "green", - "value": 1 + "value": 3 } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": "/3" + } }, "overrides": [] }, @@ -215,7 +192,7 @@ { "id": 4, "type": "stat", - "title": "Control plane schedulable", + "title": "Control plane workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -228,7 +205,7 @@ }, "targets": [ { - "expr": "sum(kube_node_spec_unschedulable{node=~\"titan-0a|titan-0b|titan-0c\"} == 0)", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", "refId": "A" } ], @@ -242,16 +219,19 @@ "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", + "color": "red", "value": 1 } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -267,7 +247,14 @@ "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] }, { "id": 5, @@ -285,7 +272,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", "refId": "A" } ], @@ -299,16 +286,19 @@ "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", + "color": "red", "value": 1 } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -324,7 +314,14 @@ "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] }, { "id": 6, @@ -342,10 +339,222 @@ }, "targets": [ { - "expr": "sum(((time() - kube_pod_deletion_timestamp) > 600))", + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))", "refId": "A" } ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] + }, + { + "id": 7, + "type": "stat", + "title": "Hottest node: CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 5 + }, + "targets": [ + { + "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] + }, + { + "id": 8, + "type": "stat", + "title": "Hottest node: RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 5 + }, + "targets": [ + { + "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] + }, + { + "id": 9, + "type": "stat", + "title": "Hottest node: NET", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 5 + }, + "targets": [ + { + "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], "fieldConfig": { "defaults": { "color": { @@ -365,69 +574,10 @@ } ] }, - "unit": "none" - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 7, - "type": "stat", - "title": "Hottest node: CPU", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 4, - "x": 24, - "y": 0 - }, - "targets": [ - { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], - "thresholds": { - "mode": "percentage", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 70 - }, - { - "color": "red", - "value": 85 - } - ] - }, - "unit": "percent" + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -443,25 +593,32 @@ "values": false }, "textMode": "value_and_name" - } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] }, { - "id": 8, + "id": 10, "type": "stat", - "title": "Hottest node: RAM", + "title": "Hottest node: I/O", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, - "w": 4, - "x": 28, - "y": 0 + "w": 6, + "x": 18, + "y": 5 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "refId": "A", "legendFormat": "{{node}}" } @@ -473,23 +630,22 @@ }, "mappings": [], "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", - "value": 70 - }, - { - "color": "red", - "value": 85 + "color": "green", + "value": 1 } ] }, - "unit": "percent" + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -505,10 +661,17 @@ "values": false }, "textMode": "value_and_name" - } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] }, { - "id": 9, + "id": 11, "type": "piechart", "title": "Namespace CPU share", "datasource": { @@ -519,18 +682,18 @@ "h": 9, "w": 12, "x": 0, - "y": 5 + "y": 10 }, "targets": [ { - "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))", - "refId": "A", - "legendFormat": "{{namespace}}" + "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "displayName": "{{namespace}}" }, "overrides": [] }, @@ -550,7 +713,7 @@ } }, { - "id": 10, + "id": 12, "type": "piechart", "title": "Namespace RAM share", "datasource": { @@ -561,18 +724,18 @@ "h": 9, "w": 12, "x": 12, - "y": 5 + "y": 10 }, "targets": [ { - "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))", - "refId": "A", - "legendFormat": "{{namespace}}" + "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "displayName": "{{namespace}}" }, "overrides": [] }, @@ -592,7 +755,7 @@ } }, { - "id": 11, + "id": 13, "type": "timeseries", "title": "Cluster node CPU", "datasource": { @@ -603,11 +766,11 @@ "h": 8, "w": 12, "x": 0, - "y": 14 + "y": 19 }, "targets": [ { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -629,10 +792,17 @@ "tooltip": { "mode": "multi" } - } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] }, { - "id": 12, + "id": 14, "type": "timeseries", "title": "Cluster node RAM", "datasource": { @@ -643,11 +813,11 @@ "h": 8, "w": 12, "x": 12, - "y": 14 + "y": 19 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -669,92 +839,19 @@ "tooltip": { "mode": "multi" } - } - }, - { - "id": 13, - "type": "table", - "title": "Problem pods (details)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 22 - }, - "targets": [ + "links": [ { - "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "s" - }, - "overrides": [] - }, - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - } - ] - }, - { - "id": 14, - "type": "table", - "title": "Terminating >10m", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 22 - }, - "targets": [ - { - "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "s" - }, - "overrides": [] - }, - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "filterByValue", - "options": { - "match": "Value", - "operator": "gt", - "value": 600 - } + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true } ] }, { "id": 15, "type": "timeseries", - "title": "Control plane CPU", + "title": "Control plane CPU (incl. titan-db)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -763,11 +860,11 @@ "h": 7, "w": 12, "x": 0, - "y": 30 + "y": 27 }, "targets": [ { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -781,7 +878,7 @@ "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right" }, "tooltip": { "mode": "multi" @@ -791,7 +888,7 @@ { "id": 16, "type": "timeseries", - "title": "Control plane RAM", + "title": "Control plane RAM (incl. titan-db)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -800,11 +897,11 @@ "h": 7, "w": 12, "x": 12, - "y": 30 + "y": 27 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -818,7 +915,7 @@ "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right" }, "tooltip": { "mode": "multi" @@ -828,6 +925,92 @@ { "id": 17, "type": "timeseries", + "title": "Cluster ingress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 34 + }, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 18, + "type": "timeseries", + "title": "Cluster egress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 34 + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 19, + "type": "timeseries", "title": "Root filesystem usage", "datasource": { "type": "prometheus", @@ -837,11 +1020,11 @@ "h": 8, "w": 12, "x": 0, - "y": 37 + "y": 41 }, "targets": [ { - "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -864,10 +1047,17 @@ "mode": "multi" } }, - "timeFrom": "7d" + "timeFrom": "30d", + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 18, + "id": 20, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -878,13 +1068,12 @@ "h": 8, "w": 12, "x": 12, - "y": 37 + "y": 41 }, "targets": [ { - "expr": "topk(8, avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info))", - "refId": "A", - "legendFormat": "{{node}}" + "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A" } ], "fieldConfig": { @@ -893,7 +1082,7 @@ "min": 0, "max": 100, "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ { "color": "green", @@ -912,7 +1101,8 @@ "value": 85 } ] - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -926,10 +1116,17 @@ "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 19, + "id": 21, "type": "stat", "title": "Astreae usage", "datasource": { @@ -940,7 +1137,7 @@ "h": 6, "w": 6, "x": 0, - "y": 45 + "y": 49 }, "targets": [ { @@ -971,7 +1168,10 @@ } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -987,10 +1187,17 @@ "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 20, + "id": 22, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1001,7 +1208,7 @@ "h": 6, "w": 6, "x": 6, - "y": 45 + "y": 49 }, "targets": [ { @@ -1032,7 +1239,10 @@ } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -1048,10 +1258,17 @@ "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 21, + "id": 23, "type": "stat", "title": "Astreae free", "datasource": { @@ -1062,7 +1279,7 @@ "h": 6, "w": 6, "x": 12, - "y": 45 + "y": 49 }, "targets": [ { @@ -1089,7 +1306,10 @@ } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -1105,10 +1325,17 @@ "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 22, + "id": 24, "type": "stat", "title": "Asteria free", "datasource": { @@ -1119,7 +1346,7 @@ "h": 6, "w": 6, "x": 18, - "y": 45 + "y": 49 }, "targets": [ { @@ -1146,7 +1373,10 @@ } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -1162,77 +1392,12 @@ "values": false }, "textMode": "value" - } - }, - { - "id": 23, - "type": "table", - "title": "Astreae per-node usage", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 51 - }, - "targets": [ + "links": [ { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - } - ] - }, - { - "id": 24, - "type": "table", - "title": "Asteria per-node usage", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 51 - }, - "targets": [ - { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true } ] }, @@ -1244,16 +1409,15 @@ "h": 5, "w": 24, "x": 0, - "y": 59 + "y": 55 }, "datasource": null, "options": { "mode": "markdown", - "content": "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders" + "content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly." } } ], - "refresh": "30s", "schemaVersion": 39, "style": "dark", "tags": [ @@ -1266,5 +1430,31 @@ "time": { "from": "now-12h", "to": "now" - } + }, + "links": [ + { + "title": "Atlas Pods", + "type": "dashboard", + "dashboardUid": "atlas-pods", + "keepTime": false + }, + { + "title": "Atlas Nodes", + "type": "dashboard", + "dashboardUid": "atlas-nodes", + "keepTime": false + }, + { + "title": "Atlas Storage", + "type": "dashboard", + "dashboardUid": "atlas-storage", + "keepTime": false + }, + { + "title": "Atlas Network", + "type": "dashboard", + "dashboardUid": "atlas-network", + "keepTime": false + } + ] } diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index 91f80eb..3e7dd0e 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -1,11 +1,251 @@ { "uid": "atlas-pods", "title": "Atlas Pods", - "folderUid": "atlas-pods", + "folderUid": "atlas-internal", "editable": true, "panels": [ { "id": 1, + "type": "stat", + "title": "Problem pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "CrashLoop / ImagePull", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Stuck terminating (>10m)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Control plane workloads", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, "type": "table", "title": "Pods not running", "datasource": { @@ -16,11 +256,11 @@ "h": 10, "w": 24, "x": 0, - "y": 0 + "y": 4 }, "targets": [ { - "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", "refId": "A" } ], @@ -41,7 +281,7 @@ ] }, { - "id": 2, + "id": 6, "type": "table", "title": "CrashLoop / ImagePull", "datasource": { @@ -52,11 +292,11 @@ "h": 10, "w": 24, "x": 0, - "y": 10 + "y": 14 }, "targets": [ { - "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", + "expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", "refId": "A" } ], @@ -77,9 +317,9 @@ ] }, { - "id": 3, + "id": 7, "type": "table", - "title": "Terminating pods", + "title": "Terminating >10m", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -88,11 +328,11 @@ "h": 10, "w": 24, "x": 0, - "y": 20 + "y": 24 }, "targets": [ { - "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + "expr": "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-storage.json b/services/monitoring/dashboards/atlas-storage.json index aa1948d..bb7d152 100644 --- a/services/monitoring/dashboards/atlas-storage.json +++ b/services/monitoring/dashboards/atlas-storage.json @@ -1,7 +1,7 @@ { "uid": "atlas-storage", "title": "Atlas Storage", - "folderUid": "atlas-storage", + "folderUid": "atlas-internal", "editable": true, "panels": [ { @@ -31,19 +31,26 @@ }, "mappings": [], "thresholds": { - "mode": "absolute", + "mode": "percentage", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", - "value": 1 + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -88,19 +95,26 @@ }, "mappings": [], "thresholds": { - "mode": "absolute", + "mode": "percentage", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", - "value": 1 + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -157,7 +171,10 @@ } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -214,7 +231,10 @@ } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -235,20 +255,20 @@ { "id": 5, "type": "timeseries", - "title": "Root filesystem", + "title": "Astreae per-node usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 24, + "w": 12, "x": 0, "y": 5 }, "targets": [ { - "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -272,21 +292,59 @@ }, { "id": 6, - "type": "table", - "title": "Astreae nodes", + "type": "timeseries", + "title": "Asteria per-node usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 10, + "h": 9, + "w": 12, + "x": 12, + "y": 5 + }, + "targets": [ + { + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "30d" + }, + { + "id": 7, + "type": "timeseries", + "title": "Astreae usage history", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, "w": 12, "x": 0, "y": 14 }, "targets": [ { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], @@ -297,32 +355,33 @@ "overrides": [] }, "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" } - ] + }, + "timeFrom": "90d" }, { - "id": 7, - "type": "table", - "title": "Asteria nodes", + "id": 8, + "type": "timeseries", + "title": "Asteria usage history", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 10, + "h": 9, "w": 12, "x": 12, "y": 14 }, "targets": [ { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], @@ -333,14 +392,15 @@ "overrides": [] }, "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" } - ] + }, + "timeFrom": "90d" } ], "time": { diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml new file mode 100644 index 0000000..e1ba054 --- /dev/null +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -0,0 +1,393 @@ +# services/monitoring/grafana-dashboard-network.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-network + labels: + grafana_dashboard: "1" +data: + atlas-network.json: | + { + "uid": "atlas-network", + "title": "Atlas Network", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Ingress bytes/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Egress bytes/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 0 + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Top router req/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 0 + }, + "targets": [ + { + "expr": "max(topk(1, rate(traefik_router_requests_total[5m])))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "req/s", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "timeseries", + "title": "Per-node throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 4 + }, + "targets": [ + { + "expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 5, + "type": "table", + "title": "Top namespaces", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 6, + "type": "table", + "title": "Top pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 12 + }, + "targets": [ + { + "expr": "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 7, + "type": "timeseries", + "title": "Traefik routers (req/s)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 21 + }, + "targets": [ + { + "expr": "topk(10, rate(traefik_router_requests_total[5m]))", + "refId": "A", + "legendFormat": "{{router}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "req/s" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 8, + "type": "timeseries", + "title": "Traefik entrypoints (req/s)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 21 + }, + "targets": [ + { + "expr": "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))", + "refId": "A", + "legendFormat": "{{entrypoint}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "req/s" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "network" + ] + } diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index 516f207..afbeb3c 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -10,26 +10,26 @@ data: { "uid": "atlas-nodes", "title": "Atlas Nodes", - "folderUid": "atlas-nodes", + "folderUid": "atlas-internal", "editable": true, "panels": [ { "id": 1, "type": "stat", - "title": "Node count", + "title": "Worker nodes ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 6, + "h": 4, + "w": 8, "x": 0, "y": 0 }, "targets": [ { - "expr": "count(kube_node_info)", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -52,7 +52,11 @@ data: } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": "/18" + } }, "overrides": [] }, @@ -73,20 +77,20 @@ data: { "id": 2, "type": "stat", - "title": "Ready nodes", + "title": "Control plane ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 6, - "x": 6, + "h": 4, + "w": 8, + "x": 8, "y": 0 }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})", "refId": "A" } ], @@ -109,7 +113,11 @@ data: } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": "/3" + } }, "overrides": [] }, @@ -130,22 +138,21 @@ data: { "id": 3, "type": "stat", - "title": "Control plane CPU avg", + "title": "Control plane workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 6, - "x": 12, + "h": 4, + "w": 8, + "x": 16, "y": 0 }, "targets": [ { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", - "refId": "A", - "legendFormat": "{{node}}" + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "refId": "A" } ], "fieldConfig": { @@ -167,7 +174,10 @@ data: } ] }, - "unit": "percent" + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -182,69 +192,11 @@ data: "fields": "", "values": false }, - "textMode": "value_and_name" + "textMode": "value" } }, { "id": 4, - "type": "stat", - "title": "Control plane RAM avg", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 6, - "x": 18, - "y": 0 - }, - "targets": [ - { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value_and_name" - } - }, - { - "id": 5, "type": "timeseries", "title": "Node CPU", "datasource": { @@ -255,11 +207,51 @@ data: "h": 9, "w": 24, "x": 0, - "y": 5 + "y": 4 }, "targets": [ { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 5, + "type": "timeseries", + "title": "Node RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 13 + }, + "targets": [ + { + "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -286,20 +278,20 @@ data: { "id": 6, "type": "timeseries", - "title": "Node RAM", + "title": "Control plane (incl. titan-db) CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 24, + "w": 12, "x": 0, - "y": 14 + "y": 22 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -313,10 +305,7 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] + "placement": "right" }, "tooltip": { "mode": "multi" @@ -326,7 +315,44 @@ data: { "id": 7, "type": "timeseries", - "title": "Root filesystem", + "title": "Control plane (incl. titan-db) RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 22 + }, + "targets": [ + { + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 8, + "type": "timeseries", + "title": "Root filesystem usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -335,11 +361,11 @@ data: "h": 9, "w": 24, "x": 0, - "y": 23 + "y": 31 }, "targets": [ { - "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -359,7 +385,7 @@ data: "mode": "multi" } }, - "timeFrom": "7d" + "timeFrom": "30d" } ], "time": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index a20e05a..199dfb2 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -10,45 +10,11 @@ data: { "uid": "atlas-overview", "title": "Atlas Overview", - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": false, "folderUid": "atlas-overview", - "graphTooltip": 0, - "links": [ - { - "title": "Pods dashboard", - "type": "dashboard", - "dashboardUid": "atlas-pods", - "keepTime": false - }, - { - "title": "Nodes dashboard", - "type": "dashboard", - "dashboardUid": "atlas-nodes", - "keepTime": false - }, - { - "title": "Storage dashboard", - "type": "dashboard", - "dashboardUid": "atlas-storage", - "keepTime": false - } - ], + "editable": false, + "annotations": { + "list": [] + }, "panels": [ { "id": 1, @@ -89,7 +55,10 @@ data: } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -123,7 +92,7 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -137,16 +106,20 @@ data: "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "red", "value": null }, { "color": "green", - "value": 1 + "value": 18 } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": "/18" + } }, "overrides": [] }, @@ -194,16 +167,20 @@ data: "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "red", "value": null }, { "color": "green", - "value": 1 + "value": 3 } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": "/3" + } }, "overrides": [] }, @@ -224,7 +201,7 @@ data: { "id": 4, "type": "stat", - "title": "Control plane schedulable", + "title": "Control plane workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -237,7 +214,7 @@ data: }, "targets": [ { - "expr": "sum(kube_node_spec_unschedulable{node=~\"titan-0a|titan-0b|titan-0c\"} == 0)", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", "refId": "A" } ], @@ -251,16 +228,19 @@ data: "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", + "color": "red", "value": 1 } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -276,7 +256,14 @@ data: "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] }, { "id": 5, @@ -294,7 +281,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", "refId": "A" } ], @@ -308,16 +295,19 @@ data: "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", + "color": "red", "value": 1 } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -333,7 +323,14 @@ data: "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] }, { "id": 6, @@ -351,10 +348,222 @@ data: }, "targets": [ { - "expr": "sum(((time() - kube_pod_deletion_timestamp) > 600))", + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))", "refId": "A" } ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] + }, + { + "id": 7, + "type": "stat", + "title": "Hottest node: CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 5 + }, + "targets": [ + { + "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] + }, + { + "id": 8, + "type": "stat", + "title": "Hottest node: RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 5 + }, + "targets": [ + { + "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] + }, + { + "id": 9, + "type": "stat", + "title": "Hottest node: NET", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 5 + }, + "targets": [ + { + "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], "fieldConfig": { "defaults": { "color": { @@ -374,69 +583,10 @@ data: } ] }, - "unit": "none" - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 7, - "type": "stat", - "title": "Hottest node: CPU", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 4, - "x": 24, - "y": 0 - }, - "targets": [ - { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], - "thresholds": { - "mode": "percentage", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 70 - }, - { - "color": "red", - "value": 85 - } - ] - }, - "unit": "percent" + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -452,25 +602,32 @@ data: "values": false }, "textMode": "value_and_name" - } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] }, { - "id": 8, + "id": 10, "type": "stat", - "title": "Hottest node: RAM", + "title": "Hottest node: I/O", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, - "w": 4, - "x": 28, - "y": 0 + "w": 6, + "x": 18, + "y": 5 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "refId": "A", "legendFormat": "{{node}}" } @@ -482,23 +639,22 @@ data: }, "mappings": [], "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", - "value": 70 - }, - { - "color": "red", - "value": 85 + "color": "green", + "value": 1 } ] }, - "unit": "percent" + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -514,10 +670,17 @@ data: "values": false }, "textMode": "value_and_name" - } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] }, { - "id": 9, + "id": 11, "type": "piechart", "title": "Namespace CPU share", "datasource": { @@ -528,18 +691,18 @@ data: "h": 9, "w": 12, "x": 0, - "y": 5 + "y": 10 }, "targets": [ { - "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))", - "refId": "A", - "legendFormat": "{{namespace}}" + "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "displayName": "{{namespace}}" }, "overrides": [] }, @@ -559,7 +722,7 @@ data: } }, { - "id": 10, + "id": 12, "type": "piechart", "title": "Namespace RAM share", "datasource": { @@ -570,18 +733,18 @@ data: "h": 9, "w": 12, "x": 12, - "y": 5 + "y": 10 }, "targets": [ { - "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))", - "refId": "A", - "legendFormat": "{{namespace}}" + "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "displayName": "{{namespace}}" }, "overrides": [] }, @@ -601,7 +764,7 @@ data: } }, { - "id": 11, + "id": 13, "type": "timeseries", "title": "Cluster node CPU", "datasource": { @@ -612,11 +775,11 @@ data: "h": 8, "w": 12, "x": 0, - "y": 14 + "y": 19 }, "targets": [ { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -638,10 +801,17 @@ data: "tooltip": { "mode": "multi" } - } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] }, { - "id": 12, + "id": 14, "type": "timeseries", "title": "Cluster node RAM", "datasource": { @@ -652,11 +822,11 @@ data: "h": 8, "w": 12, "x": 12, - "y": 14 + "y": 19 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -678,92 +848,19 @@ data: "tooltip": { "mode": "multi" } - } - }, - { - "id": 13, - "type": "table", - "title": "Problem pods (details)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 22 - }, - "targets": [ + "links": [ { - "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "s" - }, - "overrides": [] - }, - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - } - ] - }, - { - "id": 14, - "type": "table", - "title": "Terminating >10m", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 22 - }, - "targets": [ - { - "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "s" - }, - "overrides": [] - }, - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "filterByValue", - "options": { - "match": "Value", - "operator": "gt", - "value": 600 - } + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true } ] }, { "id": 15, "type": "timeseries", - "title": "Control plane CPU", + "title": "Control plane CPU (incl. titan-db)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -772,11 +869,11 @@ data: "h": 7, "w": 12, "x": 0, - "y": 30 + "y": 27 }, "targets": [ { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -790,7 +887,7 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right" }, "tooltip": { "mode": "multi" @@ -800,7 +897,7 @@ data: { "id": 16, "type": "timeseries", - "title": "Control plane RAM", + "title": "Control plane RAM (incl. titan-db)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -809,11 +906,11 @@ data: "h": 7, "w": 12, "x": 12, - "y": 30 + "y": 27 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -827,7 +924,7 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right" }, "tooltip": { "mode": "multi" @@ -837,6 +934,92 @@ data: { "id": 17, "type": "timeseries", + "title": "Cluster ingress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 34 + }, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 18, + "type": "timeseries", + "title": "Cluster egress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 34 + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 19, + "type": "timeseries", "title": "Root filesystem usage", "datasource": { "type": "prometheus", @@ -846,11 +1029,11 @@ data: "h": 8, "w": 12, "x": 0, - "y": 37 + "y": 41 }, "targets": [ { - "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -873,10 +1056,17 @@ data: "mode": "multi" } }, - "timeFrom": "7d" + "timeFrom": "30d", + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 18, + "id": 20, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -887,13 +1077,12 @@ data: "h": 8, "w": 12, "x": 12, - "y": 37 + "y": 41 }, "targets": [ { - "expr": "topk(8, avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info))", - "refId": "A", - "legendFormat": "{{node}}" + "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A" } ], "fieldConfig": { @@ -902,7 +1091,7 @@ data: "min": 0, "max": 100, "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ { "color": "green", @@ -921,7 +1110,8 @@ data: "value": 85 } ] - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -935,10 +1125,17 @@ data: "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 19, + "id": 21, "type": "stat", "title": "Astreae usage", "datasource": { @@ -949,7 +1146,7 @@ data: "h": 6, "w": 6, "x": 0, - "y": 45 + "y": 49 }, "targets": [ { @@ -980,7 +1177,10 @@ data: } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -996,10 +1196,17 @@ data: "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 20, + "id": 22, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1010,7 +1217,7 @@ data: "h": 6, "w": 6, "x": 6, - "y": 45 + "y": 49 }, "targets": [ { @@ -1041,7 +1248,10 @@ data: } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -1057,10 +1267,17 @@ data: "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 21, + "id": 23, "type": "stat", "title": "Astreae free", "datasource": { @@ -1071,7 +1288,7 @@ data: "h": 6, "w": 6, "x": 12, - "y": 45 + "y": 49 }, "targets": [ { @@ -1098,7 +1315,10 @@ data: } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -1114,10 +1334,17 @@ data: "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 22, + "id": 24, "type": "stat", "title": "Asteria free", "datasource": { @@ -1128,7 +1355,7 @@ data: "h": 6, "w": 6, "x": 18, - "y": 45 + "y": 49 }, "targets": [ { @@ -1155,7 +1382,10 @@ data: } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -1171,77 +1401,12 @@ data: "values": false }, "textMode": "value" - } - }, - { - "id": 23, - "type": "table", - "title": "Astreae per-node usage", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 51 - }, - "targets": [ + "links": [ { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - } - ] - }, - { - "id": 24, - "type": "table", - "title": "Asteria per-node usage", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 51 - }, - "targets": [ - { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true } ] }, @@ -1253,16 +1418,15 @@ data: "h": 5, "w": 24, "x": 0, - "y": 59 + "y": 55 }, "datasource": null, "options": { "mode": "markdown", - "content": "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders" + "content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly." } } ], - "refresh": "30s", "schemaVersion": 39, "style": "dark", "tags": [ @@ -1275,5 +1439,31 @@ data: "time": { "from": "now-12h", "to": "now" - } + }, + "links": [ + { + "title": "Atlas Pods", + "type": "dashboard", + "dashboardUid": "atlas-pods", + "keepTime": false + }, + { + "title": "Atlas Nodes", + "type": "dashboard", + "dashboardUid": "atlas-nodes", + "keepTime": false + }, + { + "title": "Atlas Storage", + "type": "dashboard", + "dashboardUid": "atlas-storage", + "keepTime": false + }, + { + "title": "Atlas Network", + "type": "dashboard", + "dashboardUid": "atlas-network", + "keepTime": false + } + ] } diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index 3b1f5da..58cae77 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -10,11 +10,251 @@ data: { "uid": "atlas-pods", "title": "Atlas Pods", - "folderUid": "atlas-pods", + "folderUid": "atlas-internal", "editable": true, "panels": [ { "id": 1, + "type": "stat", + "title": "Problem pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "CrashLoop / ImagePull", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Stuck terminating (>10m)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Control plane workloads", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, "type": "table", "title": "Pods not running", "datasource": { @@ -25,11 +265,11 @@ data: "h": 10, "w": 24, "x": 0, - "y": 0 + "y": 4 }, "targets": [ { - "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", "refId": "A" } ], @@ -50,7 +290,7 @@ data: ] }, { - "id": 2, + "id": 6, "type": "table", "title": "CrashLoop / ImagePull", "datasource": { @@ -61,11 +301,11 @@ data: "h": 10, "w": 24, "x": 0, - "y": 10 + "y": 14 }, "targets": [ { - "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", + "expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", "refId": "A" } ], @@ -86,9 +326,9 @@ data: ] }, { - "id": 3, + "id": 7, "type": "table", - "title": "Terminating pods", + "title": "Terminating >10m", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -97,11 +337,11 @@ data: "h": 10, "w": 24, "x": 0, - "y": 20 + "y": 24 }, "targets": [ { - "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + "expr": "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-storage.yaml b/services/monitoring/grafana-dashboard-storage.yaml index 5b22804..99439fb 100644 --- a/services/monitoring/grafana-dashboard-storage.yaml +++ b/services/monitoring/grafana-dashboard-storage.yaml @@ -10,7 +10,7 @@ data: { "uid": "atlas-storage", "title": "Atlas Storage", - "folderUid": "atlas-storage", + "folderUid": "atlas-internal", "editable": true, "panels": [ { @@ -40,19 +40,26 @@ data: }, "mappings": [], "thresholds": { - "mode": "absolute", + "mode": "percentage", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", - "value": 1 + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -97,19 +104,26 @@ data: }, "mappings": [], "thresholds": { - "mode": "absolute", + "mode": "percentage", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", - "value": 1 + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -166,7 +180,10 @@ data: } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -223,7 +240,10 @@ data: } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -244,20 +264,20 @@ data: { "id": 5, "type": "timeseries", - "title": "Root filesystem", + "title": "Astreae per-node usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 24, + "w": 12, "x": 0, "y": 5 }, "targets": [ { - "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -281,21 +301,59 @@ data: }, { "id": 6, - "type": "table", - "title": "Astreae nodes", + "type": "timeseries", + "title": "Asteria per-node usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 10, + "h": 9, + "w": 12, + "x": 12, + "y": 5 + }, + "targets": [ + { + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "30d" + }, + { + "id": 7, + "type": "timeseries", + "title": "Astreae usage history", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, "w": 12, "x": 0, "y": 14 }, "targets": [ { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], @@ -306,32 +364,33 @@ data: "overrides": [] }, "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" } - ] + }, + "timeFrom": "90d" }, { - "id": 7, - "type": "table", - "title": "Asteria nodes", + "id": 8, + "type": "timeseries", + "title": "Asteria usage history", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 10, + "h": 9, "w": 12, "x": 12, "y": 14 }, "targets": [ { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], @@ -342,14 +401,15 @@ data: "overrides": [] }, "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" } - ] + }, + "timeFrom": "90d" } ], "time": { diff --git a/services/monitoring/grafana-folders.yaml b/services/monitoring/grafana-folders.yaml index d390679..c52b4e1 100644 --- a/services/monitoring/grafana-folders.yaml +++ b/services/monitoring/grafana-folders.yaml @@ -19,22 +19,8 @@ data: permission: Edit - role: Admin permission: Admin - - uid: atlas-pods - title: Atlas Pods - permissions: - - role: Editor - permission: View - - role: Admin - permission: Admin - - uid: atlas-nodes - title: Atlas Nodes - permissions: - - role: Editor - permission: View - - role: Admin - permission: Admin - - uid: atlas-storage - title: Atlas Storage + - uid: atlas-internal + title: Atlas Internal permissions: - role: Editor permission: View diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index e23f903..58035b6 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -288,7 +288,7 @@ spec: path: /var/lib/grafana/dashboards/overview - name: pods orgId: 1 - folder: Atlas Pods + folder: Atlas Internal type: file disableDeletion: false editable: true @@ -296,7 +296,7 @@ spec: path: /var/lib/grafana/dashboards/pods - name: nodes orgId: 1 - folder: Atlas Nodes + folder: Atlas Internal type: file disableDeletion: false editable: true @@ -304,17 +304,26 @@ spec: path: /var/lib/grafana/dashboards/nodes - name: storage orgId: 1 - folder: Atlas Storage + folder: Atlas Internal type: file disableDeletion: false editable: true options: path: /var/lib/grafana/dashboards/storage + - name: network + orgId: 1 + folder: Atlas Internal + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/network dashboardsConfigMaps: overview: grafana-dashboard-overview pods: grafana-dashboard-pods nodes: grafana-dashboard-nodes storage: grafana-dashboard-storage + network: grafana-dashboard-network extraConfigmapMounts: - name: grafana-folders mountPath: /etc/grafana/provisioning/folders diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 282ee4f..76263c1 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -9,5 +9,6 @@ resources: - grafana-dashboard-pods.yaml - grafana-dashboard-nodes.yaml - grafana-dashboard-storage.yaml + - grafana-dashboard-network.yaml - grafana-folders.yaml - helmrelease.yaml