diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 97070d2..11bd2c8 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -32,7 +32,7 @@ data: ) PROM_DS = {"type": "prometheus", "uid": "atlas-vm"} -PUBLIC_FOLDER = "atlas-overview" +PUBLIC_FOLDER = "overview" PRIVATE_FOLDER = "atlas-internal" PERCENT_THRESHOLDS = { @@ -231,10 +231,13 @@ NAMESPACE_GPU_ALLOC = ( 'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}' ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)' ) -NAMESPACE_GPU_USAGE = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)' +NAMESPACE_GPU_USAGE_SHARE = ( + 'sum(avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}[1h])) by (namespace)' +) +NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)' NAMESPACE_GPU_RAW = ( "(" - + NAMESPACE_GPU_USAGE + + NAMESPACE_GPU_USAGE_SHARE + ") or on(namespace) (" + NAMESPACE_CPU_RAW + " * 0)" @@ -519,7 +522,7 @@ def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None): "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"], - "fields": "/.*/", + "fields": "Value", "values": False, }, }, @@ -555,7 +558,7 @@ def build_overview(): row1_stats = [ ( 1, - "Workers ready", + "Workers Ready", f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', WORKER_SUFFIX, WORKER_TOTAL, @@ -563,7 +566,7 @@ def build_overview(): ), ( 2, - "Control plane ready", + "Control Plane Ready", f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})', CONTROL_SUFFIX, CONTROL_TOTAL, @@ -571,7 +574,7 @@ def build_overview(): ), ( 3, - "Control plane workloads", + "Control Plane Workloads", CONTROL_WORKLOADS_EXPR, None, 4, @@ -579,7 +582,7 @@ def build_overview(): ), ( 4, - "Problem pods", + "Problem Pods", PROBLEM_PODS_EXPR, None, 1, @@ -587,7 +590,7 @@ def build_overview(): ), ( 5, - "Stuck terminating", + "Stuck Terminating", STUCK_TERMINATING_EXPR, None, 1, ), @@ -644,7 +647,7 @@ def build_overview(): ], } width, x = gauge_grid(idx) - if panel_id == 3: + if panel_id in (3, 4, 5): panels.append( stat_panel( panel_id, @@ -654,6 +657,7 @@ def build_overview(): thresholds=thresholds, legend=None, links=links, + text_mode="value", ) ) else: @@ -693,10 +697,10 @@ def build_overview(): ) storage_panels = [ - (23, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), - (24, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), - (25, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), - (26, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), + (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), + (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"), + (25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"), + (26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"), ] for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): panels.append( @@ -714,7 +718,7 @@ def build_overview(): panels.append( pie_panel( 11, - "Namespace CPU share", + "Namespace CPU Share", namespace_cpu_share_expr(), {"h": 9, "w": 8, "x": 0, "y": 16}, ) @@ -722,7 +726,7 @@ def build_overview(): panels.append( pie_panel( 12, - "Namespace GPU share", + "Namespace GPU Share", namespace_gpu_share_expr(), {"h": 9, "w": 8, "x": 8, "y": 16}, ) @@ -730,7 +734,7 @@
panels.append( pie_panel( 13, - "Namespace RAM share", + "Namespace RAM Share", namespace_ram_share_expr(), {"h": 9, "w": 8, "x": 16, "y": 16}, ) @@ -740,7 +744,7 @@ def build_overview(): panels.append( timeseries_panel( 14, - "Worker node CPU", + "Worker Node CPU", node_cpu_expr(worker_filter), {"h": 8, "w": 12, "x": 0, "y": 32}, unit="percent", @@ -754,7 +758,7 @@ def build_overview(): panels.append( timeseries_panel( 15, - "Worker node RAM", + "Worker Node RAM", node_mem_expr(worker_filter), {"h": 8, "w": 12, "x": 12, "y": 32}, unit="percent", @@ -794,7 +798,7 @@ def build_overview(): panels.append( timeseries_panel( 18, - "Cluster ingress throughput", + "Cluster Ingress Throughput", NET_INGRESS_EXPR, {"h": 7, "w": 8, "x": 0, "y": 25}, unit="Bps", @@ -807,7 +811,7 @@ def build_overview(): panels.append( timeseries_panel( 19, - "Cluster egress throughput", + "Cluster Egress Throughput", NET_EGRESS_EXPR, {"h": 7, "w": 8, "x": 8, "y": 25}, unit="Bps", @@ -820,7 +824,7 @@ def build_overview(): panels.append( timeseries_panel( 20, - "Intra-cluster throughput", + "Intra-Cluster Throughput", NET_INTERNAL_EXPR, {"h": 7, "w": 8, "x": 16, "y": 25}, unit="Bps", @@ -834,7 +838,7 @@ def build_overview(): panels.append( timeseries_panel( 21, - "Root filesystem usage", + "Root Filesystem Usage", root_usage_expr(), {"h": 8, "w": 12, "x": 0, "y": 47}, unit="percent", @@ -849,7 +853,7 @@ def build_overview(): panels.append( bargauge_panel( 22, - "Nodes closest to full root disks", + "Nodes Closest to Full Root Disks", f"topk(8, {root_usage_expr()})", {"h": 8, "w": 12, "x": 12, "y": 47}, unit="percent", @@ -868,7 +872,8 @@ def build_overview(): "style": "dark", "tags": ["atlas", "overview"], "templating": {"list": []}, - "time": {"from": "now-12h", "to": "now"}, + "time": {"from": "now-1h", "to": "now"}, + "refresh": "1m", "links": [ {"title": "Atlas Pods", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False}, {"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False}, @@ -884,7 +889,7 @@ def build_pods_dashboard(): panels.append( stat_panel( 1, - "Problem pods", + "Problem Pods", PROBLEM_PODS_EXPR, {"h": 4, "w": 6, "x": 0, "y": 0}, thresholds={ @@ -914,7 +919,7 @@ def build_pods_dashboard(): panels.append( stat_panel( 3, - "Stuck terminating (>10m)", + "Stuck Terminating (>10m)", STUCK_TERMINATING_EXPR, {"h": 4, "w": 6, "x": 12, "y": 0}, thresholds={ @@ -929,7 +934,7 @@ def build_pods_dashboard(): panels.append( stat_panel( 4, - "Control plane workloads", + "Control Plane Workloads", f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})', {"h": 4, "w": 6, "x": 18, "y": 0}, thresholds={ @@ -945,7 +950,7 @@ def build_pods_dashboard(): panels.append( table_panel( 5, - "Pods not running", + "Pods Not Running", PROBLEM_TABLE_EXPR, {"h": 10, "w": 24, "x": 0, "y": 4}, unit="s", @@ -994,7 +999,7 @@ def build_nodes_dashboard(): panels.append( stat_panel( 1, - "Worker nodes ready", + "Worker Nodes Ready", f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', {"h": 4, "w": 8, "x": 0, "y": 0}, value_suffix=WORKER_SUFFIX, @@ -1003,7 +1008,7 @@ def build_nodes_dashboard(): panels.append( stat_panel( 2, - "Control plane ready", + "Control Plane Ready", f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})', {"h": 4, "w": 8, "x": 8, "y": 0}, value_suffix=CONTROL_SUFFIX, @@ -1012,7 +1017,7 @@ def build_nodes_dashboard(): panels.append( stat_panel( 3, - "Control plane 
workloads", + "Control Plane Workloads", f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})', {"h": 4, "w": 8, "x": 16, "y": 0}, ) @@ -1046,7 +1051,7 @@ def build_nodes_dashboard(): panels.append( timeseries_panel( 6, - "Control plane (incl. titan-db) CPU", + "Control Plane (incl. titan-db) CPU", node_cpu_expr(CONTROL_ALL_REGEX), {"h": 9, "w": 12, "x": 0, "y": 22}, unit="percent", @@ -1058,7 +1063,7 @@ def build_nodes_dashboard(): panels.append( timeseries_panel( 7, - "Control plane (incl. titan-db) RAM", + "Control Plane (incl. titan-db) RAM", node_mem_expr(CONTROL_ALL_REGEX), {"h": 9, "w": 12, "x": 12, "y": 22}, unit="percent", @@ -1070,7 +1075,7 @@ def build_nodes_dashboard(): panels.append( timeseries_panel( 8, - "Root filesystem usage", + "Root Filesystem Usage", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 31}, unit="percent", @@ -1099,7 +1104,7 @@ def build_storage_dashboard(): panels.append( stat_panel( 1, - "Astreae usage", + "Astreae Usage", astreae_usage_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 0, "y": 0}, unit="percent", @@ -1109,7 +1114,7 @@ def build_storage_dashboard(): panels.append( stat_panel( 2, - "Asteria usage", + "Asteria Usage", astreae_usage_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 6, "y": 0}, unit="percent", @@ -1119,7 +1124,7 @@ def build_storage_dashboard(): panels.append( stat_panel( 3, - "Astreae free", + "Astreae Free", astreae_free_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="decbytes", @@ -1128,7 +1133,7 @@ def build_storage_dashboard(): panels.append( stat_panel( 4, - "Asteria free", + "Asteria Free", astreae_free_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="decbytes", @@ -1137,7 +1142,7 @@ def build_storage_dashboard(): panels.append( timeseries_panel( 5, - "Astreae per-node usage", + "Astreae Per-Node Usage", filesystem_usage_expr("/mnt/astreae", LONGHORN_NODE_REGEX), {"h": 9, "w": 12, "x": 0, "y": 5}, unit="percent", @@ -1150,7 +1155,7 @@ def build_storage_dashboard(): panels.append( timeseries_panel( 6, - "Asteria per-node usage", + "Asteria Per-Node Usage", filesystem_usage_expr("/mnt/asteria", LONGHORN_NODE_REGEX), {"h": 9, "w": 12, "x": 12, "y": 5}, unit="percent", @@ -1163,7 +1168,7 @@ def build_storage_dashboard(): panels.append( timeseries_panel( 7, - "Astreae usage history", + "Astreae Usage History", astreae_usage_expr("/mnt/astreae"), {"h": 9, "w": 12, "x": 0, "y": 14}, unit="percent", @@ -1173,7 +1178,7 @@ def build_storage_dashboard(): panels.append( timeseries_panel( 8, - "Asteria usage history", + "Asteria Usage History", astreae_usage_expr("/mnt/asteria"), {"h": 9, "w": 12, "x": 12, "y": 14}, unit="percent", @@ -1199,7 +1204,7 @@ def build_network_dashboard(): panels.append( stat_panel( 1, - "Ingress traffic", + "Ingress Traffic", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="Bps", @@ -1208,7 +1213,7 @@ def build_network_dashboard(): panels.append( stat_panel( 2, - "Egress traffic", + "Egress Traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps", @@ -1217,7 +1222,7 @@ def build_network_dashboard(): panels.append( stat_panel( 3, - "Intra-cluster traffic", + "Intra-Cluster Traffic", NET_INTERNAL_EXPR, {"h": 4, "w": 8, "x": 16, "y": 0}, unit="Bps", @@ -1226,7 +1231,7 @@ def build_network_dashboard(): panels.append( stat_panel( 4, - "Top router req/s", + "Top Router req/s", f"topk(1, {TRAEFIK_ROUTER_EXPR})", {"h": 4, "w": 8, "x": 0, "y": 4}, unit="req/s", @@ -1236,7 +1241,7 @@ def build_network_dashboard(): panels.append( timeseries_panel( 5, 
- "Per-node throughput", + "Per-Node Throughput", f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})', {"h": 8, "w": 24, "x": 0, "y": 8}, unit="Bps", @@ -1248,7 +1253,7 @@ def build_network_dashboard(): panels.append( table_panel( 6, - "Top namespaces", + "Top Namespaces", 'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))', {"h": 9, "w": 12, "x": 0, "y": 16}, @@ -1259,7 +1264,7 @@ def build_network_dashboard(): panels.append( table_panel( 7, - "Top pods", + "Top Pods", 'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))', {"h": 9, "w": 12, "x": 12, "y": 16}, @@ -1270,7 +1275,7 @@ def build_network_dashboard(): panels.append( timeseries_panel( 8, - "Traefik routers (req/s)", + "Traefik Routers (req/s)", f"topk(10, {TRAEFIK_ROUTER_EXPR})", {"h": 9, "w": 12, "x": 0, "y": 25}, unit="req/s", @@ -1282,7 +1287,7 @@ def build_network_dashboard(): panels.append( timeseries_panel( 9, - "Traefik entrypoints (req/s)", + "Traefik Entrypoints (req/s)", 'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))', {"h": 9, "w": 12, "x": 12, "y": 25}, unit="req/s", @@ -1310,7 +1315,7 @@ def build_gpu_dashboard(): panels.append( pie_panel( 1, - "Namespace GPU share", + "Namespace GPU Share", namespace_gpu_share_expr(), {"h": 8, "w": 12, "x": 0, "y": 0}, ) @@ -1318,8 +1323,8 @@ def build_gpu_dashboard(): panels.append( timeseries_panel( 2, - "GPU util by namespace", - NAMESPACE_GPU_USAGE, + "GPU Util by Namespace", + NAMESPACE_GPU_USAGE_INSTANT, {"h": 8, "w": 12, "x": 12, "y": 0}, unit="percent", legend="{{namespace}}", @@ -1330,7 +1335,7 @@ def build_gpu_dashboard(): panels.append( timeseries_panel( 3, - "GPU util by node", + "GPU Util by Node", 'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})', {"h": 8, "w": 12, "x": 0, "y": 8}, unit="percent", @@ -1342,7 +1347,7 @@ def build_gpu_dashboard(): panels.append( table_panel( 4, - "Top pods by GPU util", + "Top Pods by GPU Util", 'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))', {"h": 8, "w": 12, "x": 12, "y": 8}, unit="percent", diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index da235a5..8c1367b 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -7,7 +7,7 @@ { "id": 1, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace GPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 
0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -60,7 +60,7 @@ { "id": 2, "type": "timeseries", - "title": "GPU util by namespace", + "title": "GPU Util by Namespace", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -97,7 +97,7 @@ { "id": 3, "type": "timeseries", - "title": "GPU util by node", + "title": "GPU Util by Node", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -134,7 +134,7 @@ { "id": 4, "type": "table", - "title": "Top pods by GPU util", + "title": "Top Pods by GPU Util", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index f2291b7..ff0af9b 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -7,7 +7,7 @@ { "id": 1, "type": "stat", - "title": "Ingress traffic", + "title": "Ingress Traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -67,7 +67,7 @@ { "id": 2, "type": "stat", - "title": "Egress traffic", 
+ "title": "Egress Traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -127,7 +127,7 @@ { "id": 3, "type": "stat", - "title": "Intra-cluster traffic", + "title": "Intra-Cluster Traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -187,7 +187,7 @@ { "id": 4, "type": "stat", - "title": "Top router req/s", + "title": "Top Router req/s", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -248,7 +248,7 @@ { "id": 5, "type": "timeseries", - "title": "Per-node throughput", + "title": "Per-Node Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -285,7 +285,7 @@ { "id": 6, "type": "table", - "title": "Top namespaces", + "title": "Top Namespaces", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -321,7 +321,7 @@ { "id": 7, "type": "table", - "title": "Top pods", + "title": "Top Pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -357,7 +357,7 @@ { "id": 8, "type": "timeseries", - "title": "Traefik routers (req/s)", + "title": "Traefik Routers (req/s)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -394,7 +394,7 @@ { "id": 9, "type": "timeseries", - "title": "Traefik entrypoints (req/s)", + "title": "Traefik Entrypoints (req/s)", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index 3cf784f..802fe5a 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -7,7 +7,7 @@ { "id": 1, "type": "stat", - "title": "Worker nodes ready", + "title": "Worker Nodes Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -68,7 +68,7 @@ { "id": 2, "type": "stat", - "title": "Control plane ready", + "title": "Control Plane Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -129,7 +129,7 @@ { "id": 3, "type": "stat", - "title": "Control plane workloads", + "title": "Control Plane Workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -269,7 +269,7 @@ { "id": 6, "type": "timeseries", - "title": "Control plane (incl. titan-db) CPU", + "title": "Control Plane (incl. titan-db) CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -306,7 +306,7 @@ { "id": 7, "type": "timeseries", - "title": "Control plane (incl. titan-db) RAM", + "title": "Control Plane (incl. 
titan-db) RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -343,7 +343,7 @@ { "id": 8, "type": "timeseries", - "title": "Root filesystem usage", + "title": "Root Filesystem Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 4e3c357..b556594 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1,7 +1,7 @@ { "uid": "atlas-overview", "title": "Atlas Overview", - "folderUid": "atlas-overview", + "folderUid": "overview", "editable": false, "annotations": { "list": [] @@ -10,7 +10,7 @@ { "id": 1, "type": "gauge", - "title": "Workers ready", + "title": "Workers Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -71,7 +71,7 @@ { "id": 2, "type": "gauge", - "title": "Control plane ready", + "title": "Control Plane Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -124,7 +124,7 @@ { "id": 3, "type": "stat", - "title": "Control plane workloads", + "title": "Control Plane Workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -198,8 +198,8 @@ }, { "id": 4, - "type": "gauge", - "title": "Problem pods", + "type": "stat", + "title": "Problem Pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -218,8 +218,10 @@ ], "fieldConfig": { "defaults": { - "min": 0, - "max": 4, + "color": { + "mode": "palette-classic" + }, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ @@ -240,11 +242,18 @@ "value": 3 } ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" } }, "overrides": [] }, "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -252,9 +261,7 @@ "fields": "", "values": false }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false + "textMode": "value" }, "links": [ { @@ -266,8 +273,8 @@ }, { "id": 5, - "type": "gauge", - "title": "Stuck terminating", + "type": "stat", + "title": "Stuck Terminating", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -286,8 +293,10 @@ ], "fieldConfig": { "defaults": { - "min": 0, - "max": 4, + "color": { + "mode": "palette-classic" + }, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ @@ -308,11 +317,18 @@ "value": 3 } ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" } }, "overrides": [] }, "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -320,9 +336,7 @@ "fields": "", "values": false }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false + "textMode": "value" }, "links": [ { @@ -619,7 +633,7 @@ { "id": 23, "type": "stat", - "title": "Astreae usage", + "title": "Astreae Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -690,7 +704,7 @@ { "id": 24, "type": "stat", - "title": "Asteria usage", + "title": "Asteria Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -761,7 +775,7 @@ { "id": 25, "type": "stat", - "title": "Astreae free", + "title": "Astreae Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -828,7 +842,7 @@ { "id": 26, "type": "stat", - "title": "Asteria free", + "title": "Asteria Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -895,7 +909,7 @@ { "id": 11, "type": "piechart", - "title": "Namespace CPU share", + "title": 
"Namespace CPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -948,7 +962,7 @@ { "id": 12, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace GPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -961,7 +975,7 @@ }, "targets": [ { - "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by 
(namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1001,7 +1015,7 @@ { "id": 13, "type": "piechart", - "title": "Namespace RAM share", + "title": "Namespace RAM Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1054,7 +1068,7 @@ { "id": 14, "type": "timeseries", - "title": "Worker node CPU", + "title": "Worker Node CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1101,7 +1115,7 @@ { "id": 15, "type": "timeseries", - "title": "Worker node RAM", + "title": "Worker Node RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1222,7 +1236,7 @@ { "id": 18, "type": "timeseries", - "title": "Cluster ingress throughput", + "title": "Cluster Ingress Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1266,7 +1280,7 @@ { "id": 19, "type": "timeseries", - "title": "Cluster egress throughput", + "title": "Cluster Egress Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1310,7 +1324,7 @@ { "id": 20, "type": "timeseries", - "title": "Intra-cluster throughput", + "title": "Intra-Cluster Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1354,7 +1368,7 @@ { "id": 21, "type": "timeseries", - "title": "Root filesystem usage", + "title": "Root Filesystem Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1402,7 +1416,7 @@ { "id": 22, "type": "bargauge", - "title": "Nodes closest to full root disks", + "title": "Nodes Closest to Full Root Disks", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1456,7 +1470,7 @@ "calcs": [ "lastNotNull" ], - "fields": "/.*/", + "fields": "Value", "values": false } }, @@ -1479,9 +1493,10 @@ "list": [] }, "time": { - "from": "now-12h", + "from": "now-1h", "to": "now" }, + "refresh": "1m", "links": [ { "title": "Atlas Pods", diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index f519d14..ef616e0 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -7,7 +7,7 @@ { "id": 1, "type": "stat", - "title": "Problem pods", + "title": "Problem Pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -127,7 +127,7 @@ { "id": 3, "type": "stat", - "title": "Stuck terminating (>10m)", + "title": "Stuck Terminating (>10m)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -187,7 +187,7 @@ { "id": 4, "type": "stat", - "title": "Control plane workloads", + "title": "Control Plane Workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -247,7 +247,7 @@ { "id": 5, "type": "table", - "title": "Pods not running", + "title": "Pods Not Running", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/dashboards/atlas-storage.json b/services/monitoring/dashboards/atlas-storage.json index 6585794..1d07040 100644 --- a/services/monitoring/dashboards/atlas-storage.json +++ b/services/monitoring/dashboards/atlas-storage.json @@ -7,7 +7,7 @@ { "id": 1, "type": "stat", - "title": "Astreae usage", + "title": "Astreae Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -71,7 +71,7 @@ { "id": 2, "type": "stat", - "title": "Asteria usage", + "title": "Asteria Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -135,7 +135,7 @@ { "id": 3, "type": "stat", - "title": "Astreae free", + "title": "Astreae Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -195,7 +195,7 @@ { "id": 4, "type": "stat", - 
"title": "Asteria free", + "title": "Asteria Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -255,7 +255,7 @@ { "id": 5, "type": "timeseries", - "title": "Astreae per-node usage", + "title": "Astreae Per-Node Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -293,7 +293,7 @@ { "id": 6, "type": "timeseries", - "title": "Asteria per-node usage", + "title": "Asteria Per-Node Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -331,7 +331,7 @@ { "id": 7, "type": "timeseries", - "title": "Astreae usage history", + "title": "Astreae Usage History", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -368,7 +368,7 @@ { "id": 8, "type": "timeseries", - "title": "Asteria usage history", + "title": "Asteria Usage History", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 13262d6..1a86c73 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -16,7 +16,7 @@ data: { "id": 1, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace GPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or 
kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -69,7 +69,7 @@ data: { "id": 2, "type": "timeseries", - "title": "GPU util by namespace", + "title": "GPU Util by Namespace", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -106,7 +106,7 @@ data: { "id": 3, "type": "timeseries", - "title": "GPU util by node", + "title": "GPU Util by Node", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -143,7 +143,7 @@ data: { "id": 4, "type": "table", - "title": "Top pods by GPU util", + "title": "Top Pods by GPU Util", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index 4b78fb9..fd1f5d6 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -16,7 +16,7 @@ data: { "id": 1, "type": "stat", - "title": "Ingress traffic", + "title": "Ingress Traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -76,7 +76,7 @@ data: { "id": 2, "type": "stat", - "title": "Egress traffic", + "title": "Egress Traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -136,7 +136,7 @@ data: { "id": 3, "type": "stat", - "title": "Intra-cluster traffic", + "title": "Intra-Cluster Traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -196,7 +196,7 @@ data: { "id": 4, "type": "stat", - "title": "Top router req/s", + "title": "Top Router req/s", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -257,7 +257,7 @@ data: { "id": 5, "type": "timeseries", - "title": "Per-node throughput", + "title": "Per-Node Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -294,7 +294,7 @@ data: { "id": 6, "type": "table", - "title": "Top namespaces", + "title": "Top Namespaces", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -330,7 +330,7 @@ data: { "id": 7, "type": "table", - "title": "Top pods", + "title": "Top Pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -366,7 +366,7 @@ data: { "id": 8, "type": "timeseries", - "title": "Traefik routers (req/s)", + "title": "Traefik Routers (req/s)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -403,7 +403,7 @@ data: { "id": 9, "type": "timeseries", - "title": "Traefik entrypoints (req/s)", + "title": "Traefik Entrypoints (req/s)", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git 
a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index c78e994..2facfed 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -16,7 +16,7 @@ data: { "id": 1, "type": "stat", - "title": "Worker nodes ready", + "title": "Worker Nodes Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -77,7 +77,7 @@ data: { "id": 2, "type": "stat", - "title": "Control plane ready", + "title": "Control Plane Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -138,7 +138,7 @@ data: { "id": 3, "type": "stat", - "title": "Control plane workloads", + "title": "Control Plane Workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -278,7 +278,7 @@ data: { "id": 6, "type": "timeseries", - "title": "Control plane (incl. titan-db) CPU", + "title": "Control Plane (incl. titan-db) CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -315,7 +315,7 @@ data: { "id": 7, "type": "timeseries", - "title": "Control plane (incl. titan-db) RAM", + "title": "Control Plane (incl. titan-db) RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -352,7 +352,7 @@ data: { "id": 8, "type": "timeseries", - "title": "Root filesystem usage", + "title": "Root Filesystem Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 512adf9..6fbf7c9 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -10,7 +10,7 @@ data: { "uid": "atlas-overview", "title": "Atlas Overview", - "folderUid": "atlas-overview", + "folderUid": "overview", "editable": false, "annotations": { "list": [] @@ -19,7 +19,7 @@ data: { "id": 1, "type": "gauge", - "title": "Workers ready", + "title": "Workers Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -80,7 +80,7 @@ data: { "id": 2, "type": "gauge", - "title": "Control plane ready", + "title": "Control Plane Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -133,7 +133,7 @@ data: { "id": 3, "type": "stat", - "title": "Control plane workloads", + "title": "Control Plane Workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -207,8 +207,8 @@ data: }, { "id": 4, - "type": "gauge", - "title": "Problem pods", + "type": "stat", + "title": "Problem Pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -227,8 +227,10 @@ data: ], "fieldConfig": { "defaults": { - "min": 0, - "max": 4, + "color": { + "mode": "palette-classic" + }, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ @@ -249,11 +251,18 @@ data: "value": 3 } ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" } }, "overrides": [] }, "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -261,9 +270,7 @@ data: "fields": "", "values": false }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false + "textMode": "value" }, "links": [ { @@ -275,8 +282,8 @@ data: }, { "id": 5, - "type": "gauge", - "title": "Stuck terminating", + "type": "stat", + "title": "Stuck Terminating", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -295,8 +302,10 @@ data: ], "fieldConfig": { "defaults": { - "min": 0, - "max": 4, + "color": { + "mode": "palette-classic" + }, + "mappings": [], "thresholds": { 
"mode": "absolute", "steps": [ @@ -317,11 +326,18 @@ data: "value": 3 } ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" } }, "overrides": [] }, "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -329,9 +345,7 @@ data: "fields": "", "values": false }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false + "textMode": "value" }, "links": [ { @@ -628,7 +642,7 @@ data: { "id": 23, "type": "stat", - "title": "Astreae usage", + "title": "Astreae Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -699,7 +713,7 @@ data: { "id": 24, "type": "stat", - "title": "Asteria usage", + "title": "Asteria Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -770,7 +784,7 @@ data: { "id": 25, "type": "stat", - "title": "Astreae free", + "title": "Astreae Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -837,7 +851,7 @@ data: { "id": 26, "type": "stat", - "title": "Asteria free", + "title": "Asteria Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -904,7 +918,7 @@ data: { "id": 11, "type": "piechart", - "title": "Namespace CPU share", + "title": "Namespace CPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -957,7 +971,7 @@ data: { "id": 12, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace GPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -970,7 +984,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by 
(namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1010,7 +1024,7 @@ data: { "id": 13, "type": "piechart", - "title": "Namespace RAM share", + "title": "Namespace RAM Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1063,7 +1077,7 @@ data: { "id": 14, "type": "timeseries", - "title": "Worker node CPU", + "title": "Worker Node CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1110,7 +1124,7 @@ data: { "id": 15, "type": "timeseries", - "title": "Worker node RAM", + "title": "Worker Node RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1231,7 +1245,7 @@ data: { "id": 18, "type": "timeseries", - "title": "Cluster ingress throughput", + "title": "Cluster Ingress Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1275,7 +1289,7 @@ data: { "id": 19, "type": "timeseries", - "title": "Cluster egress throughput", + "title": "Cluster Egress Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1319,7 +1333,7 @@ data: { "id": 20, "type": "timeseries", - "title": "Intra-cluster throughput", + "title": "Intra-Cluster Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1363,7 +1377,7 @@ data: { "id": 21, "type": "timeseries", - "title": "Root filesystem usage", + "title": "Root Filesystem Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1411,7 +1425,7 @@ data: { "id": 22, "type": "bargauge", - "title": "Nodes closest to full root disks", + "title": "Nodes Closest to Full Root Disks", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1465,7 +1479,7 @@ data: "calcs": [ "lastNotNull" ], - "fields": "/.*/", + "fields": "Value", "values": false } }, @@ -1488,9 +1502,10 @@ data: "list": [] }, "time": { - "from": "now-12h", + "from": "now-1h", "to": "now" }, + "refresh": "1m", "links": [ { "title": "Atlas Pods", diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index 78beca5..f92adf1 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -16,7 +16,7 @@ data: { "id": 1, "type": "stat", - "title": "Problem pods", + "title": "Problem Pods", "datasource": { "type": 
"prometheus", "uid": "atlas-vm" @@ -136,7 +136,7 @@ data: { "id": 3, "type": "stat", - "title": "Stuck terminating (>10m)", + "title": "Stuck Terminating (>10m)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -196,7 +196,7 @@ data: { "id": 4, "type": "stat", - "title": "Control plane workloads", + "title": "Control Plane Workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -256,7 +256,7 @@ data: { "id": 5, "type": "table", - "title": "Pods not running", + "title": "Pods Not Running", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/grafana-dashboard-storage.yaml b/services/monitoring/grafana-dashboard-storage.yaml index 1bbf1ea..0a534f2 100644 --- a/services/monitoring/grafana-dashboard-storage.yaml +++ b/services/monitoring/grafana-dashboard-storage.yaml @@ -16,7 +16,7 @@ data: { "id": 1, "type": "stat", - "title": "Astreae usage", + "title": "Astreae Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -80,7 +80,7 @@ data: { "id": 2, "type": "stat", - "title": "Asteria usage", + "title": "Asteria Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -144,7 +144,7 @@ data: { "id": 3, "type": "stat", - "title": "Astreae free", + "title": "Astreae Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -204,7 +204,7 @@ data: { "id": 4, "type": "stat", - "title": "Asteria free", + "title": "Asteria Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -264,7 +264,7 @@ data: { "id": 5, "type": "timeseries", - "title": "Astreae per-node usage", + "title": "Astreae Per-Node Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -302,7 +302,7 @@ data: { "id": 6, "type": "timeseries", - "title": "Asteria per-node usage", + "title": "Asteria Per-Node Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -340,7 +340,7 @@ data: { "id": 7, "type": "timeseries", - "title": "Astreae usage history", + "title": "Astreae Usage History", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -377,7 +377,7 @@ data: { "id": 8, "type": "timeseries", - "title": "Asteria usage history", + "title": "Asteria Usage History", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/grafana-folders.yaml b/services/monitoring/grafana-folders.yaml index c52b4e1..54b278f 100644 --- a/services/monitoring/grafana-folders.yaml +++ b/services/monitoring/grafana-folders.yaml @@ -10,8 +10,8 @@ data: folders.yaml: | apiVersion: 1 folders: - - uid: atlas-overview - title: Atlas Overview + - uid: overview + title: Overview permissions: - role: Viewer permission: View @@ -26,3 +26,10 @@ data: permission: View - role: Admin permission: Admin + - uid: oceanus-internal + title: Oceanus Internal + permissions: + - role: Editor + permission: View + - role: Admin + permission: Admin diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index cf56b27..2546dc1 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -256,6 +256,8 @@ spec: server: domain: metrics.bstein.dev root_url: https://metrics.bstein.dev/ + dashboards: + default_home_dashboard_path: /var/lib/grafana/dashboards/overview/atlas-overview.json auth.anonymous: hide_version: true users: @@ -290,7 +292,7 @@ spec: providers: - name: overview orgId: 1 - folder: Atlas Overview + folder: Overview type: file disableDeletion: false editable: false