From ff056551c7d6fb7cc64dfed73680f8b7a13421e5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 14:08:33 -0300 Subject: [PATCH] monitoring: refresh overview dashboards --- scripts/render_dashboards.py | 182 +-- .../monitoring/dashboards/atlas-network.json | 86 +- .../monitoring/dashboards/atlas-overview.json | 1150 +++++++++-------- services/monitoring/dcgm-exporter.yaml | 74 ++ .../monitoring/grafana-dashboard-network.yaml | 86 +- .../grafana-dashboard-overview.yaml | 1150 +++++++++-------- services/monitoring/kustomization.yaml | 1 + 7 files changed, 1511 insertions(+), 1218 deletions(-) create mode 100644 services/monitoring/dcgm-exporter.yaml diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 937dfb7..273090a 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -165,22 +165,22 @@ def node_io_expr(scope=""): return scoped_node_expr(base, scope) -def namespace_cpu_share_expr(): - selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)" +def namespace_share_expr(resource_expr): + selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )" + total = f"clamp_min(sum( {resource_expr} ), 1)" return f"100 * ( {selected} ) / {total}" +def namespace_cpu_share_expr(): + return namespace_share_expr(NAMESPACE_CPU_RAW) + + def namespace_ram_share_expr(): - selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)" - return f"100 * ( {selected} ) / {total}" + return namespace_share_expr(NAMESPACE_RAM_RAW) def namespace_gpu_share_expr(): - selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" - return f"100 * ( {selected} ) / {total}" + return namespace_share_expr(NAMESPACE_GPU_RAW) PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) 
(kube_pod_status_phase{phase!~"Running|Succeeded"}))' @@ -228,35 +228,47 @@ NAMESPACE_GPU_ALLOC = ( 'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}' ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)' ) -NAMESPACE_GPU_USAGE = ( - 'sum(rate(container_accelerator_duty_cycle{namespace!="",accelerator="nvidia.com/gpu"}[5m])) by (namespace)' -) +NAMESPACE_GPU_USAGE = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)' NAMESPACE_GPU_RAW = ( "(" + NAMESPACE_GPU_USAGE + ") or on(namespace) (" - + NAMESPACE_GPU_ALLOC + + NAMESPACE_CPU_RAW + " * 0)" ) -NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_ALLOC -NAMESPACE_COMBINED_FILTER = ( - 'topk(10, (' +NAMESPACE_GPU_WEIGHT = ( + "((" + + NAMESPACE_GPU_ALLOC + + ") or on(namespace) (" + NAMESPACE_CPU_RAW - + ") + (" - + NAMESPACE_RAM_RAW - + ' / 1e9) + (' - + NAMESPACE_GPU_WEIGHT - + " * 10))" + + " * 0))" ) +NAMESPACE_ACTIVITY_SCORE = ( + "( " + + NAMESPACE_CPU_RAW + + " ) + (" + + NAMESPACE_RAM_RAW + + " / 1e9) + (" + + NAMESPACE_GPU_WEIGHT + + " * 100)" +) +NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)" TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" -NET_INGRESS_EXPR = ( - 'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))' +TRAEFIK_NET_INGRESS = ( + 'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))' " or on() vector(0)" ) -NET_EGRESS_EXPR = ( +TRAEFIK_NET_EGRESS = ( + 'sum(rate(container_network_transmit_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))' + " or on() vector(0)" +) +NET_TOTAL_EXPR = ( 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))' " or on() vector(0)" ) +NET_INGRESS_EXPR = TRAEFIK_NET_INGRESS +NET_EGRESS_EXPR = TRAEFIK_NET_EGRESS +NET_INTERNAL_EXPR = f"clamp_min(({NET_TOTAL_EXPR}) - ({TRAEFIK_NET_EGRESS}), 0)" # 
--------------------------------------------------------------------------- # Panel factories @@ -438,10 +450,20 @@ def pie_panel(panel_id, title, expr, grid): "datasource": PROM_DS, "gridPos": grid, "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}], - "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []}, + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": {"mode": "palette-classic"}, + }, + "overrides": [], + }, "options": { "legend": {"displayMode": "list", "placement": "right"}, "pieType": "pie", + "displayLabels": ["percent"], + "tooltip": {"mode": "single"}, + "colorScheme": "interpolateSpectral", + "colorBy": "value", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, }, } @@ -511,7 +533,6 @@ def build_overview(): 1, link_to("atlas-pods"), ), - (6, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None), ] for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats): thresholds = None @@ -591,12 +612,31 @@ def build_overview(): ) ) + storage_panels = [ + (23, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), + (24, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), + (25, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), + (26, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), + ] + for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): + panels.append( + stat_panel( + panel_id, + title, + expr, + {"h": 6, "w": 6, "x": 6 * idx, "y": 10}, + unit=unit, + thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, + links=link_to("atlas-storage"), + ) + ) + panels.append( pie_panel( 11, "Namespace CPU share", namespace_cpu_share_expr(), - {"h": 9, "w": 8, "x": 0, "y": 10}, + {"h": 9, "w": 8, "x": 0, "y": 16}, ) ) panels.append( @@ -604,7 +644,7 @@ def build_overview(): 12, "Namespace GPU share", namespace_gpu_share_expr(), - {"h": 9, "w": 8, "x": 8, "y": 10}, + {"h": 
9, "w": 8, "x": 8, "y": 16}, ) ) panels.append( @@ -612,7 +652,7 @@ def build_overview(): 13, "Namespace RAM share", namespace_ram_share_expr(), - {"h": 9, "w": 8, "x": 16, "y": 10}, + {"h": 9, "w": 8, "x": 16, "y": 16}, ) ) @@ -622,7 +662,7 @@ def build_overview(): 14, "Worker node CPU", node_cpu_expr(worker_filter), - {"h": 8, "w": 12, "x": 0, "y": 19}, + {"h": 8, "w": 12, "x": 0, "y": 25}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -636,7 +676,7 @@ def build_overview(): 15, "Worker node RAM", node_mem_expr(worker_filter), - {"h": 8, "w": 12, "x": 12, "y": 19}, + {"h": 8, "w": 12, "x": 12, "y": 25}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -651,7 +691,7 @@ def build_overview(): 16, "Control plane CPU", node_cpu_expr(CONTROL_REGEX), - {"h": 7, "w": 12, "x": 0, "y": 27}, + {"h": 7, "w": 12, "x": 0, "y": 33}, unit="percent", legend="{{node}}", legend_display="table", @@ -663,7 +703,7 @@ def build_overview(): 17, "Control plane RAM", node_mem_expr(CONTROL_REGEX), - {"h": 7, "w": 12, "x": 12, "y": 27}, + {"h": 7, "w": 12, "x": 12, "y": 33}, unit="percent", legend="{{node}}", legend_display="table", @@ -676,9 +716,9 @@ def build_overview(): 18, "Cluster ingress throughput", NET_INGRESS_EXPR, - {"h": 7, "w": 12, "x": 0, "y": 34}, + {"h": 7, "w": 8, "x": 0, "y": 40}, unit="Bps", - legend="Ingress", + legend="Ingress (Traefik)", legend_display="list", legend_placement="bottom", links=link_to("atlas-network"), @@ -689,9 +729,22 @@ def build_overview(): 19, "Cluster egress throughput", NET_EGRESS_EXPR, - {"h": 7, "w": 12, "x": 12, "y": 34}, + {"h": 7, "w": 8, "x": 8, "y": 40}, unit="Bps", - legend="Egress", + legend="Egress (Traefik)", + legend_display="list", + legend_placement="bottom", + links=link_to("atlas-network"), + ) + ) + panels.append( + timeseries_panel( + 20, + "Intra-cluster throughput", + NET_INTERNAL_EXPR, + {"h": 7, "w": 8, "x": 16, "y": 40}, + unit="Bps", + legend="Internal traffic", legend_display="list", 
legend_placement="bottom", links=link_to("atlas-network"), @@ -700,10 +753,10 @@ def build_overview(): panels.append( timeseries_panel( - 20, + 21, "Root filesystem usage", root_usage_expr(), - {"h": 8, "w": 12, "x": 0, "y": 41}, + {"h": 8, "w": 12, "x": 0, "y": 47}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -715,11 +768,11 @@ def build_overview(): ) panels.append( { - "id": 21, + "id": 22, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": PROM_DS, - "gridPos": {"h": 8, "w": 12, "x": 12, "y": 41}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 47}, "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}], "fieldConfig": { "defaults": { @@ -744,28 +797,10 @@ def build_overview(): "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, }, "links": link_to("atlas-storage"), + "transformations": [{"id": "labelsToFields", "options": {}}], } ) - storage_panels = [ - (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), - (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), - (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), - (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), - ] - for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): - panels.append( - stat_panel( - panel_id, - title, - expr, - {"h": 6, "w": 6, "x": 6 * idx, "y": 49}, - unit=unit, - thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, - links=link_to("atlas-storage"), - ) - ) - return { "uid": "atlas-overview", "title": "Atlas Overview", @@ -1110,12 +1145,15 @@ def build_network_dashboard(): panels.append( stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps") ) + panels.append( + stat_panel(3, "Intra-cluster traffic", NET_INTERNAL_EXPR, {"h": 4, "w": 8, "x": 16, "y": 0}, unit="Bps") + ) panels.append( stat_panel( - 3, + 4, "Top router req/s", f"topk(1, 
{TRAEFIK_ROUTER_EXPR})", - {"h": 4, "w": 8, "x": 16, "y": 0}, + {"h": 4, "w": 8, "x": 0, "y": 4}, unit="req/s", legend="{{router}}", instant=True, @@ -1123,10 +1161,10 @@ def build_network_dashboard(): ) panels.append( timeseries_panel( - 4, + 5, "Per-node throughput", node_net_expr(), - {"h": 8, "w": 24, "x": 0, "y": 4}, + {"h": 8, "w": 24, "x": 0, "y": 8}, unit="Bps", legend="{{node}}", legend_display="table", @@ -1135,32 +1173,32 @@ def build_network_dashboard(): ) panels.append( table_panel( - 5, + 6, "Top namespaces", 'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))', - {"h": 9, "w": 12, "x": 0, "y": 12}, + {"h": 9, "w": 12, "x": 0, "y": 16}, unit="Bps", transformations=[{"id": "labelsToFields", "options": {}}], ) ) panels.append( table_panel( - 6, + 7, "Top pods", 'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))', - {"h": 9, "w": 12, "x": 12, "y": 12}, + {"h": 9, "w": 12, "x": 12, "y": 16}, unit="Bps", transformations=[{"id": "labelsToFields", "options": {}}], ) ) panels.append( timeseries_panel( - 7, + 8, "Traefik routers (req/s)", f"topk(10, {TRAEFIK_ROUTER_EXPR})", - {"h": 9, "w": 12, "x": 0, "y": 21}, + {"h": 9, "w": 12, "x": 0, "y": 25}, unit="req/s", legend="{{router}}", legend_display="table", @@ -1169,10 +1207,10 @@ def build_network_dashboard(): ) panels.append( timeseries_panel( - 8, + 9, "Traefik entrypoints (req/s)", 'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))', - {"h": 9, "w": 12, "x": 12, "y": 21}, + {"h": 9, "w": 12, "x": 12, "y": 25}, unit="req/s", legend="{{entrypoint}}", legend_display="table", diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index 098e1db..1baec3a 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ 
b/services/monitoring/dashboards/atlas-network.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -80,7 +80,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -127,7 +127,7 @@ { "id": 3, "type": "stat", - "title": "Top router req/s", + "title": "Intra-cluster traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -138,6 +138,66 @@ "x": 16, "y": 0 }, + "targets": [ + { + "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "Bps", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Top router req/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 4 + }, "targets": [ { "expr": "topk(1, sum by (router) 
(rate(traefik_router_requests_total[5m])))", @@ -187,7 +247,7 @@ } }, { - "id": 4, + "id": 5, "type": "timeseries", "title": "Per-node throughput", "datasource": { @@ -198,7 +258,7 @@ "h": 8, "w": 24, "x": 0, - "y": 4 + "y": 8 }, "targets": [ { @@ -224,7 +284,7 @@ } }, { - "id": 5, + "id": 6, "type": "table", "title": "Top namespaces", "datasource": { @@ -235,7 +295,7 @@ "h": 9, "w": 12, "x": 0, - "y": 12 + "y": 16 }, "targets": [ { @@ -260,7 +320,7 @@ ] }, { - "id": 6, + "id": 7, "type": "table", "title": "Top pods", "datasource": { @@ -271,7 +331,7 @@ "h": 9, "w": 12, "x": 12, - "y": 12 + "y": 16 }, "targets": [ { @@ -296,7 +356,7 @@ ] }, { - "id": 7, + "id": 8, "type": "timeseries", "title": "Traefik routers (req/s)", "datasource": { @@ -307,7 +367,7 @@ "h": 9, "w": 12, "x": 0, - "y": 21 + "y": 25 }, "targets": [ { @@ -333,7 +393,7 @@ } }, { - "id": 8, + "id": 9, "type": "timeseries", "title": "Traefik entrypoints (req/s)", "datasource": { @@ -344,7 +404,7 @@ "h": 9, "w": 12, "x": 12, - "y": 21 + "y": 25 }, "targets": [ { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index ad460bb..eba6466 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -325,59 +325,6 @@ } ] }, - { - "id": 6, - "type": "gauge", - "title": "Running pods", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 4, - "x": 20, - "y": 0 - }, - "targets": [ - { - "expr": "sum(kube_pod_status_phase{phase=\"Running\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "min": 0, - "max": 5, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "options": { - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "orientation": "auto", - 
"showThresholdMarkers": false, - "showThresholdLabels": false - } - }, { "id": 7, "type": "stat", @@ -663,506 +610,7 @@ ] }, { - "id": 11, - "type": "piechart", - "title": "Namespace CPU share", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 0, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 12, - "type": "piechart", - "title": "Namespace GPU share", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 8, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, 
(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 13, - "type": "piechart", - "title": "Namespace RAM share", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 16, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) 
by (namespace) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 14, - "type": "timeseries", - "title": "Worker node CPU", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 19 - }, - "targets": [ - { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-nodes dashboard", - "url": "/d/atlas-nodes", - "targetBlank": true - } - ] - }, - { - "id": 15, - "type": "timeseries", - "title": "Worker node RAM", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 19 - }, - "targets": [ - { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * 
on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-nodes dashboard", - "url": "/d/atlas-nodes", - "targetBlank": true - } - ] - }, - { - "id": 16, - "type": "timeseries", - "title": "Control plane CPU", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 27 - }, - "targets": [ - { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 17, - "type": "timeseries", - "title": "Control plane RAM", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 27 - }, - "targets": [ - { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", 
\"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 18, - "type": "timeseries", - "title": "Cluster ingress throughput", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 34 - }, - "targets": [ - { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", - "refId": "A", - "legendFormat": "Ingress" - } - ], - "fieldConfig": { - "defaults": { - "unit": "Bps" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-network dashboard", - "url": "/d/atlas-network", - "targetBlank": true - } - ] - }, - { - "id": 19, - "type": "timeseries", - "title": "Cluster egress throughput", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 34 - }, - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", - "refId": "A", - "legendFormat": "Egress" - } - ], - "fieldConfig": { - "defaults": { - "unit": "Bps" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-network dashboard", - "url": "/d/atlas-network", - "targetBlank": true - } - ] - }, - { - "id": 20, - "type": "timeseries", - "title": "Root filesystem usage", - 
"datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 41 - }, - "targets": [ - { - "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] - }, - "tooltip": { - "mode": "multi" - } - }, - "timeFrom": "30d", - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 21, - "type": "bargauge", - "title": "Nodes closest to full root disks", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 41 - }, - "targets": [ - { - "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 70 - }, - { - "color": "red", - "value": 85 - } - ] - } - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - 
"lastNotNull" - ], - "fields": "", - "values": false - } - }, - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 21, + "id": 23, "type": "stat", "title": "Astreae usage", "datasource": { @@ -1173,7 +621,7 @@ "h": 6, "w": 6, "x": 0, - "y": 49 + "y": 10 }, "targets": [ { @@ -1233,7 +681,7 @@ ] }, { - "id": 22, + "id": 24, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1244,7 +692,7 @@ "h": 6, "w": 6, "x": 6, - "y": 49 + "y": 10 }, "targets": [ { @@ -1304,7 +752,7 @@ ] }, { - "id": 23, + "id": 25, "type": "stat", "title": "Astreae free", "datasource": { @@ -1315,7 +763,7 @@ "h": 6, "w": 6, "x": 12, - "y": 49 + "y": 10 }, "targets": [ { @@ -1371,7 +819,7 @@ ] }, { - "id": 24, + "id": 26, "type": "stat", "title": "Asteria free", "datasource": { @@ -1382,7 +830,7 @@ "h": 6, "w": 6, "x": 18, - "y": 49 + "y": 10 }, "targets": [ { @@ -1436,6 +884,588 @@ "targetBlank": true } ] + }, + { + "id": 11, + "type": "piechart", + "title": "Namespace CPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 16 + }, + "targets": [ + { + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 12, + "type": "piechart", + "title": "Namespace GPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 16 + }, + "targets": [ + { + "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { 
+ "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 13, + "type": "piechart", + "title": "Namespace RAM share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 16 + }, + "targets": [ + { + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + 
"values": false + } + } + }, + { + "id": 14, + "type": "timeseries", + "title": "Worker node CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 25 + }, + "targets": [ + { + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] + }, + { + "id": 15, + "type": "timeseries", + "title": "Worker node RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + }, + "targets": [ + { + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { 
+ "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] + }, + { + "id": 16, + "type": "timeseries", + "title": "Control plane CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 33 + }, + "targets": [ + { + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 17, + "type": "timeseries", + "title": "Control plane RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 33 + }, + "targets": [ + { + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + 
"displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 18, + "type": "timeseries", + "title": "Cluster ingress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 40 + }, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "refId": "A", + "legendFormat": "Ingress (Traefik)" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 19, + "type": "timeseries", + "title": "Cluster egress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 40 + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "refId": "A", + "legendFormat": "Egress (Traefik)" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 20, + "type": "timeseries", + "title": "Intra-cluster throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 40 + }, + "targets": [ + { + "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - 
(sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "refId": "A", + "legendFormat": "Internal traffic" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 21, + "type": "timeseries", + "title": "Root filesystem usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 47 + }, + "targets": [ + { + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "30d", + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] + }, + { + "id": 22, + "type": "bargauge", + "title": "Nodes closest to full root disks", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 47 + }, + "targets": [ + { + "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) 
label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ], + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] } ], "schemaVersion": 39, diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml new file mode 100644 index 0000000..efd32c5 --- /dev/null +++ b/services/monitoring/dcgm-exporter.yaml @@ -0,0 +1,74 @@ +# services/monitoring/dcgm-exporter.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: dcgm-exporter + namespace: monitoring + labels: + app: dcgm-exporter +spec: + selector: + matchLabels: + app: dcgm-exporter + template: + metadata: + labels: + app: dcgm-exporter + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9400" + spec: + serviceAccountName: default + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - titan-20 + - titan-21 + - titan-22 + - titan-24 + tolerations: + - operator: Exists + containers: + - name: dcgm-exporter + image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-1 + imagePullPolicy: IfNotPresent + ports: + - name: metrics + containerPort: 9400 + env: + - name: DCGM_EXPORTER_KUBERNETES + value: "true" + 
securityContext: + privileged: true + resources: + requests: + cpu: 50m + memory: 64Mi + volumeMounts: + - name: pod-resources + mountPath: /var/lib/kubelet/pod-resources + volumes: + - name: pod-resources + hostPath: + path: /var/lib/kubelet/pod-resources + type: Directory +--- +apiVersion: v1 +kind: Service +metadata: + name: dcgm-exporter + namespace: monitoring + labels: + app: dcgm-exporter +spec: + selector: + app: dcgm-exporter + ports: + - name: metrics + port: 9400 + targetPort: metrics diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index a552793..ade7457 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -89,7 +89,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -136,7 +136,7 @@ data: { "id": 3, "type": "stat", - "title": "Top router req/s", + "title": "Intra-cluster traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -147,6 +147,66 @@ data: "x": 16, "y": 0 }, + "targets": [ + { + "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + 
}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "Bps", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Top router req/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 4 + }, "targets": [ { "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", @@ -196,7 +256,7 @@ data: } }, { - "id": 4, + "id": 5, "type": "timeseries", "title": "Per-node throughput", "datasource": { @@ -207,7 +267,7 @@ data: "h": 8, "w": 24, "x": 0, - "y": 4 + "y": 8 }, "targets": [ { @@ -233,7 +293,7 @@ data: } }, { - "id": 5, + "id": 6, "type": "table", "title": "Top namespaces", "datasource": { @@ -244,7 +304,7 @@ data: "h": 9, "w": 12, "x": 0, - "y": 12 + "y": 16 }, "targets": [ { @@ -269,7 +329,7 @@ data: ] }, { - "id": 6, + "id": 7, "type": "table", "title": "Top pods", "datasource": { @@ -280,7 +340,7 @@ data: "h": 9, "w": 12, "x": 12, - "y": 12 + "y": 16 }, "targets": [ { @@ -305,7 +365,7 @@ data: ] }, { - "id": 7, + "id": 8, "type": "timeseries", "title": "Traefik routers (req/s)", "datasource": { @@ -316,7 +376,7 @@ data: "h": 9, "w": 12, "x": 0, - "y": 21 + "y": 25 }, "targets": [ { @@ -342,7 +402,7 @@ data: } }, { - "id": 8, + "id": 9, "type": "timeseries", "title": "Traefik entrypoints (req/s)", "datasource": { @@ -353,7 +413,7 @@ data: "h": 9, "w": 12, "x": 12, - "y": 21 + "y": 25 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 6503da9..d20a5a4 100644 --- 
a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -334,59 +334,6 @@ data: } ] }, - { - "id": 6, - "type": "gauge", - "title": "Running pods", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 4, - "x": 20, - "y": 0 - }, - "targets": [ - { - "expr": "sum(kube_pod_status_phase{phase=\"Running\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "min": 0, - "max": 5, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "options": { - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false - } - }, { "id": 7, "type": "stat", @@ -672,506 +619,7 @@ data: ] }, { - "id": 11, - "type": "piechart", - "title": "Namespace CPU share", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 0, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - 
}, - "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 12, - "type": "piechart", - "title": "Namespace GPU share", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 8, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 13, - "type": "piechart", - "title": "Namespace RAM share", - "datasource": 
{ - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 16, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 14, - "type": "timeseries", - "title": "Worker node CPU", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 19 - }, - "targets": [ - { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": 
"percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-nodes dashboard", - "url": "/d/atlas-nodes", - "targetBlank": true - } - ] - }, - { - "id": 15, - "type": "timeseries", - "title": "Worker node RAM", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 19 - }, - "targets": [ - { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-nodes dashboard", - "url": "/d/atlas-nodes", - "targetBlank": true - } - ] - }, - { - "id": 16, - "type": "timeseries", - "title": "Control plane CPU", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 27 - }, - "targets": [ - { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() 
label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 17, - "type": "timeseries", - "title": "Control plane RAM", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 27 - }, - "targets": [ - { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 18, - "type": "timeseries", - "title": "Cluster ingress throughput", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 34 - }, - "targets": [ - { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", - "refId": "A", - "legendFormat": "Ingress" - } - ], - "fieldConfig": { - "defaults": { - "unit": "Bps" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-network dashboard", - "url": "/d/atlas-network", - "targetBlank": 
true - } - ] - }, - { - "id": 19, - "type": "timeseries", - "title": "Cluster egress throughput", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 34 - }, - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", - "refId": "A", - "legendFormat": "Egress" - } - ], - "fieldConfig": { - "defaults": { - "unit": "Bps" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-network dashboard", - "url": "/d/atlas-network", - "targetBlank": true - } - ] - }, - { - "id": 20, - "type": "timeseries", - "title": "Root filesystem usage", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 41 - }, - "targets": [ - { - "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] - }, - "tooltip": { - "mode": "multi" - } - }, - "timeFrom": "30d", - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 21, - "type": "bargauge", - "title": "Nodes closest to full root disks", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 41 - }, - "targets": [ - { - "expr": "topk(8, avg 
by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 70 - }, - { - "color": "red", - "value": 85 - } - ] - } - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 21, + "id": 23, "type": "stat", "title": "Astreae usage", "datasource": { @@ -1182,7 +630,7 @@ data: "h": 6, "w": 6, "x": 0, - "y": 49 + "y": 10 }, "targets": [ { @@ -1242,7 +690,7 @@ data: ] }, { - "id": 22, + "id": 24, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1253,7 +701,7 @@ data: "h": 6, "w": 6, "x": 6, - "y": 49 + "y": 10 }, "targets": [ { @@ -1313,7 +761,7 @@ data: ] }, { - "id": 23, + "id": 25, "type": "stat", "title": "Astreae free", "datasource": { @@ -1324,7 +772,7 @@ data: "h": 6, "w": 6, "x": 12, - "y": 49 + "y": 10 }, "targets": [ { @@ -1380,7 +828,7 @@ data: ] }, { - "id": 24, + "id": 26, "type": "stat", "title": "Asteria free", "datasource": { @@ -1391,7 +839,7 @@ data: "h": 6, "w": 6, "x": 18, - "y": 49 + "y": 10 }, "targets": [ { @@ -1445,6 +893,588 @@ data: "targetBlank": true } ] + }, + { + "id": 11, + "type": "piechart", + "title": "Namespace CPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, 
+ "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 16 + }, + "targets": [ + { + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 12, + "type": "piechart", + "title": "Namespace GPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 16 + }, + "targets": [ + { + "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 13, + "type": "piechart", + "title": "Namespace RAM share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 16 + }, + "targets": [ + { + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} 
or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 14, + "type": "timeseries", + "title": "Worker node CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 25 + }, + "targets": [ + { + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": 
"/d/atlas-nodes", + "targetBlank": true + } + ] + }, + { + "id": 15, + "type": "timeseries", + "title": "Worker node RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + }, + "targets": [ + { + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] + }, + { + "id": 16, + "type": "timeseries", + "title": "Control plane CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 33 + }, + "targets": [ + { + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": 
"table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 17, + "type": "timeseries", + "title": "Control plane RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 33 + }, + "targets": [ + { + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 18, + "type": "timeseries", + "title": "Cluster ingress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 40 + }, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "refId": "A", + "legendFormat": "Ingress (Traefik)" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 19, + "type": "timeseries", + "title": "Cluster egress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 40 + }, + "targets": [ + { + "expr": 
"sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "refId": "A", + "legendFormat": "Egress (Traefik)" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 20, + "type": "timeseries", + "title": "Intra-cluster throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 40 + }, + "targets": [ + { + "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "refId": "A", + "legendFormat": "Internal traffic" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 21, + "type": "timeseries", + "title": "Root filesystem usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 47 + }, + "targets": [ + { + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + 
"fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "30d", + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] + }, + { + "id": 22, + "type": "bargauge", + "title": "Nodes closest to full root disks", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 47 + }, + "targets": [ + { + "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ], + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] } ], "schemaVersion": 39, diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 76263c1..3164862 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -10,5 +10,6 @@ resources: - 
grafana-dashboard-nodes.yaml - grafana-dashboard-storage.yaml - grafana-dashboard-network.yaml + - dcgm-exporter.yaml - grafana-folders.yaml - helmrelease.yaml