From eed67b3db01c493bce6ba5aa6ef8e70d1ec4a041 Mon Sep 17 00:00:00 2001
From: Brad Stein <Brad.Stein@gmail.com>
Date: Tue, 2 Dec 2025 13:16:00 -0300
Subject: [PATCH] monitoring: regen dashboards with gpu details

---
 AGENTS.md                                     |  42 ++++
 ...shboards.py => dashboards_render_atlas.py} | 177 ++++++++++++----
 services/monitoring/dashboards/atlas-gpu.json | 184 +++++++++++++++++
 .../monitoring/dashboards/atlas-network.json  |   5 +-
 .../monitoring/dashboards/atlas-overview.json |  14 +-
 .../monitoring/grafana-dashboard-gpu.yaml     | 193 ++++++++++++++++++
 .../monitoring/grafana-dashboard-network.yaml |   5 +-
 .../grafana-dashboard-overview.yaml           |  14 +-
 services/monitoring/helmrelease.yaml          |   9 +
 services/monitoring/kustomization.yaml        |   1 +
 10 files changed, 584 insertions(+), 60 deletions(-)
 create mode 100644 AGENTS.md
 rename scripts/{render_dashboards.py => dashboards_render_atlas.py} (90%)
 create mode 100644 services/monitoring/dashboards/atlas-gpu.json
 create mode 100644 services/monitoring/grafana-dashboard-gpu.yaml
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..05838aa
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,42 @@
+
+
+# Repository Guidelines
+
+## Project Structure & Module Organization
+- `infrastructure/`: cluster-scoped building blocks (core, flux-system, traefik, longhorn). Add new platform features by mirroring this layout.
+- `services/`: workload manifests per app (`services/gitea/`, etc.) with `kustomization.yaml` plus one file per kind; keep diffs small and focused.
+- `dockerfiles/` hosts bespoke images, while `scripts/` stores operational Fish/Bash helpers and Python generators (e.g. `dashboards_render_atlas.py`)—extend these directories instead of relying on ad-hoc commands.
+
+## Build, Test, and Development Commands
+- `kustomize build services/<app>` (or `kubectl kustomize ...`) renders manifests exactly as Flux will.
+- `kubectl apply --server-side --dry-run=client -k services/<app>` checks schema compatibility without touching the cluster.
+- `flux reconcile kustomization <name> --namespace flux-system --with-source` pulls the latest Git state after merges or hotfixes.
+- `fish scripts/flux_hammer.fish --help` explains the recovery tool; read it before running against production workloads.
+
+## Coding Style & Naming Conventions
+- YAML uses two-space indents; retain the leading path comment (e.g. `# services/gitea/deployment.yaml`) to speed code review.
+- Keep resource names lowercase kebab-case, align labels/selectors, and mirror namespaces with directory names.
+- List resources in `kustomization.yaml` from namespace/config, through storage, then workloads and networking for predictable diffs.
+- Scripts start with `#!/usr/bin/env fish` or bash, stay executable, and follow snake_case names such as `flux_hammer.fish`.
+
+## Testing Guidelines
+- Run `kustomize build` and the dry-run apply for every service you touch; capture failures before opening a PR.
+- `flux diff kustomization <name> --path services/<app>` previews reconciliations—link notable output when behavior shifts.
+- Docker edits: `docker build -f dockerfiles/Dockerfile.monerod .` (swap the file you changed) to verify image builds.
+
+## Commit & Pull Request Guidelines
+- Keep commit subjects short, present-tense, and optionally scoped (`gpu(titan-24): add RuntimeClass`); squash fixups before review.
+- Describe linked issues, affected services, and required operator steps (e.g. `flux reconcile kustomization services-gitea`) in the PR body.
+- Focus each PR on one kustomization or service and update `infrastructure/flux-system` when Flux must track new folders.
+- Record the validation you ran (dry-runs, diffs, builds) and add screenshots only when ingress or UI behavior changes.
+
+## Security & Configuration Tips
+- Never commit credentials; use Vault workflows (`services/vault/`) or SOPS-encrypted manifests wired through `infrastructure/flux-system`.
+- Node selectors and tolerations gate workloads to hardware like `hardware: rpi4`; confirm labels before scaling or renaming nodes.
+- Pin external images by digest or rely on Flux image automation to follow approved tags and avoid drift.
+
+## Dashboard roadmap / context (2025-12-02)
+- Atlas dashboards are generated via `scripts/dashboards_render_atlas.py --build`, which writes JSON under `services/monitoring/dashboards/` and ConfigMaps under `services/monitoring/`. Keep the Grafana manifests in sync by regenerating after edits.
+- Atlas Overview panels are paired with internal dashboards (pods, nodes, storage, network, GPU). A new `atlas-gpu` internal dashboard holds the detailed GPU metrics that feed the overview share pie.
+- Old Grafana folders (`Atlas Storage`, `Atlas SRE`, `Atlas Public`, `Atlas Nodes`) should be removed in the Grafana UI when convenient; only `Atlas Overview` and `Atlas Internal` should remain provisioned.
+- Future work: add a separate generator (e.g., `dashboards_render_oceanus.py`) for SUI/oceanus validation dashboards, mirroring the atlas pattern of internal dashboards feeding a public overview.
diff --git a/scripts/render_dashboards.py b/scripts/dashboards_render_atlas.py
similarity index 90%
rename from scripts/render_dashboards.py
rename to scripts/dashboards_render_atlas.py
index 812a931..97070d2 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/dashboards_render_atlas.py
@@ -2,8 +2,8 @@
 """Generate Atlas Grafana dashboards and render them into ConfigMaps.
 
 Usage:
-  scripts/render_dashboards.py --build   # rebuild JSON + ConfigMaps
-  scripts/render_dashboards.py           # re-render ConfigMaps from JSON
+  scripts/dashboards_render_atlas.py --build   # rebuild JSON + ConfigMaps
+  scripts/dashboards_render_atlas.py           # re-render ConfigMaps from JSON
 """
 
 import argparse
@@ -198,7 +198,6 @@ STUCK_TERMINATING_EXPR = (
     ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
     '))'
 )
-
 PROBLEM_TABLE_EXPR = (
     "(time() - kube_pod_created{pod!=\"\"}) "
     "* on(namespace,pod) group_left(node) kube_pod_info "
@@ -489,6 +488,47 @@ def pie_panel(panel_id, title, expr, grid):
     }
 
 
+def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None):
+    """Return a horizontal, gradient "bargauge" panel dict for `expr` with label-aware reduction; `links` is optional."""
+    panel = {
+        "id": panel_id,
+        "type": "bargauge",
+        "title": title,
+        "datasource": PROM_DS,
+        "gridPos": grid,
+        "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{node}}"}],  # legend is always the `node` label
+        "fieldConfig": {
+            "defaults": {
+                "unit": unit,
+                "min": 0,
+                "max": 100 if unit == "percent" else None,  # fixed 0-100 scale only for percentages
+                "thresholds": {
+                    "mode": "absolute",  # the steps below are not conditional on `unit`
+                    "steps": [  # colour changes at 50 / 70 / 85
+                        {"color": "green", "value": None},
+                        {"color": "yellow", "value": 50},
+                        {"color": "orange", "value": 70},
+                        {"color": "red", "value": 85},
+                    ],
+                },
+            },
+            "overrides": [],
+        },
+        "options": {
+            "displayMode": "gradient",
+            "orientation": "horizontal",
+            "reduceOptions": {
+                "calcs": ["lastNotNull"],
+                "fields": "/.*/",  # regex matching every field; NOTE(review): presumably the "label-aware" part, confirm
+                "values": False,
+            },
+        },
+    }
+    if links:  # leave the "links" key out entirely when none are given
+        panel["links"] = links
+    return panel
+
+
 def text_panel(panel_id, title, content, grid):
     return {
         "id": panel_id,
@@ -554,6 +594,7 @@ def build_overview():
             link_to("atlas-pods"),
         ),
     ]
+
     def gauge_grid(idx):
         width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
         x = sum(GAUGE_WIDTHS[:idx])
@@ -806,38 +847,14 @@ def build_overview():
         )
     )
     panels.append(
-        {
-            "id": 22,
-            "type": "bargauge",
-            "title": "Nodes closest to full root disks",
-            "datasource": PROM_DS,
-            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 47},
-            "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}],
-            "fieldConfig": {
-                "defaults": {
-                    "unit": "percent",
-                    "min": 0,
-                    "max": 100,
-                    "thresholds": {
-                        "mode": "absolute",
-                        "steps": [
-                            {"color": "green", "value": None},
-                            {"color": "yellow", "value": 50},
-                            {"color": "orange", "value": 70},
-                            {"color": "red", "value": 85},
-                        ],
-                    },
-                },
-                "overrides": [],
-            },
-            "options": {
-                "displayMode": "gradient",
-                "orientation": "horizontal",
-                "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
-            },
-            "links": link_to("atlas-storage"),
-            "transformations": [{"id": "labelsToFields", "options": {}}],
-        }
+        bargauge_panel(
+            22,
+            "Nodes closest to full root disks",
+            f"topk(8, {root_usage_expr()})",
+            {"h": 8, "w": 12, "x": 12, "y": 47},
+            unit="percent",
+            links=link_to("atlas-storage"),
+        )
     )
 
     return {
@@ -857,6 +874,7 @@ def build_overview():
             {"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
             {"title": "Atlas Storage", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False},
             {"title": "Atlas Network", "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": False},
+            {"title": "Atlas GPU", "type": "dashboard", "dashboardUid": "atlas-gpu", "keepTime": False},
         ],
     }
 
@@ -1179,13 +1197,31 @@ def build_storage_dashboard():
 def build_network_dashboard():
     panels = []
     panels.append(
-        stat_panel(1, "Ingress traffic", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="Bps")
+        stat_panel(
+            1,
+            "Ingress traffic",
+            NET_INGRESS_EXPR,
+            {"h": 4, "w": 8, "x": 0, "y": 0},
+            unit="Bps",
+        )
     )
     panels.append(
-        stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps")
+        stat_panel(
+            2,
+            "Egress traffic",
+            NET_EGRESS_EXPR,
+            {"h": 4, "w": 8, "x": 8, "y": 0},
+            unit="Bps",
+        )
     )
     panels.append(
-        stat_panel(3, "Intra-cluster traffic", NET_INTERNAL_EXPR, {"h": 4, "w": 8, "x": 16, "y": 0}, unit="Bps")
+        stat_panel(
+            3,
+            "Intra-cluster traffic",
+            NET_INTERNAL_EXPR,
+            {"h": 4, "w": 8, "x": 16, "y": 0},
+            unit="Bps",
+        )
     )
     panels.append(
         stat_panel(
@@ -1195,14 +1231,13 @@ def build_network_dashboard():
             {"h": 4, "w": 8, "x": 0, "y": 4},
             unit="req/s",
             legend="{{router}}",
-            instant=True,
         )
     )
     panels.append(
         timeseries_panel(
             5,
             "Per-node throughput",
-            node_net_expr(),
+            f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
             {"h": 8, "w": 24, "x": 0, "y": 8},
             unit="Bps",
             legend="{{node}}",
@@ -1270,6 +1305,64 @@ def build_network_dashboard():
     }
 
 
+def build_gpu_dashboard():  # Build the "Atlas GPU" dashboard dict (uid "atlas-gpu"): four 12x8 panels in two rows.
+    panels = []
+    panels.append(  # row 1, left half: pie of per-namespace GPU share
+        pie_panel(
+            1,
+            "Namespace GPU share",
+            namespace_gpu_share_expr(),  # helper defined elsewhere in this script
+            {"h": 8, "w": 12, "x": 0, "y": 0},
+        )
+    )
+    panels.append(  # row 1, right half: utilisation over time, one series per namespace
+        timeseries_panel(
+            2,
+            "GPU util by namespace",
+            NAMESPACE_GPU_USAGE,  # module-level expression constant defined outside this hunk
+            {"h": 8, "w": 12, "x": 12, "y": 0},
+            unit="percent",
+            legend="{{namespace}}",
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
+    panels.append(  # row 2, left half: utilisation over time, one series per `Hostname` label
+        timeseries_panel(
+            3,
+            "GPU util by node",
+            'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
+            {"h": 8, "w": 12, "x": 0, "y": 8},
+            unit="percent",  # NOTE(review): a sum over several GPU series on one host can exceed 100; confirm intended
+            legend="{{Hostname}}",
+            legend_display="table",
+            legend_placement="right",
+        )
+    )
+    panels.append(  # row 2, right half: table of the ten highest (namespace, pod, Hostname) sums
+        table_panel(
+            4,
+            "Top pods by GPU util",
+            'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
+            {"h": 8, "w": 12, "x": 12, "y": 8},
+            unit="percent",
+            transformations=[{"id": "labelsToFields", "options": {}}],  # presumably surfaces the labels as table columns; confirm
+        )
+    )
+    return {
+        "uid": "atlas-gpu",  # same uid the overview's dashboard link list points at
+        "title": "Atlas GPU",
+        "folderUid": PRIVATE_FOLDER,
+        "editable": True,
+        "panels": panels,
+        "time": {"from": "now-12h", "to": "now"},  # default range: last 12 hours
+        "annotations": {"list": []},
+        "schemaVersion": 39,
+        "style": "dark",
+        "tags": ["atlas", "gpu"],
+    }
+
+
 DASHBOARDS = {
     "atlas-overview": {
         "builder": build_overview,
@@ -1291,6 +1384,10 @@ DASHBOARDS = {
         "builder": build_network_dashboard,
         "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml",
     },
+    "atlas-gpu": {
+        "builder": build_gpu_dashboard,
+        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",
+    },
 }
 
 
diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json
new file mode 100644
index 0000000..da235a5
--- /dev/null
+++ b/services/monitoring/dashboards/atlas-gpu.json
@@ -0,0 +1,184 @@
+{
+  "uid": "atlas-gpu",
+  "title": "Atlas GPU",
+  "folderUid": "atlas-internal",
+  "editable": true,
+  "panels": [
+    {
+      "id": 1,
+      "type": "piechart",
+      "title": "Namespace GPU share",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "refId": "A",
+          "legendFormat": "{{namespace}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent",
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "right"
+        },
+        "pieType": "pie",
+        "displayLabels": [
+          "percent"
+        ],
+        "tooltip": {
+          "mode": "single"
+        },
+        "colorScheme": "interpolateSpectral",
+        "colorBy": "value",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        }
+      }
+    },
+    {
+      "id": 2,
+      "type": "timeseries",
+      "title": "GPU util by namespace",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 0
+      },
+      "targets": [
+        {
+          "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
+          "refId": "A",
+          "legendFormat": "{{namespace}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 3,
+      "type": "timeseries",
+      "title": "GPU util by node",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
+      "targets": [
+        {
+          "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
+          "refId": "A",
+          "legendFormat": "{{Hostname}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      }
+    },
+    {
+      "id": 4,
+      "type": "table",
+      "title": "Top pods by GPU util",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "atlas-vm"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 8
+      },
+      "targets": [
+        {
+          "expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "options": {
+        "showHeader": true
+      },
+      "transformations": [
+        {
+          "id": "labelsToFields",
+          "options": {}
+        }
+      ]
+    }
+  ],
+  "time": {
+    "from": "now-12h",
+    "to": "now"
+  },
+  "annotations": {
+    "list": []
+  },
+  "schemaVersion": 39,
+  "style": "dark",
+  "tags": [
+    "atlas",
+    "gpu"
+  ]
+}
diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json
index 9005eb9..f2291b7 100644
--- a/services/monitoring/dashboards/atlas-network.json
+++ b/services/monitoring/dashboards/atlas-network.json
@@ -202,8 +202,7 @@
         {
           "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
           "refId": "A",
-          "legendFormat": "{{router}}",
-          "instant": true
+          "legendFormat": "{{router}}"
         }
       ],
       "fieldConfig": {
@@ -262,7 +261,7 @@
       },
       "targets": [
         {
-          "expr": "avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+          "expr": "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
           "refId": "A",
           "legendFormat": "{{node}}"
         }
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index 93a246b..4e3c357 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -1456,7 +1456,7 @@
           "calcs": [
             "lastNotNull"
           ],
-          "fields": "",
+          "fields": "/.*/",
           "values": false
         }
       },
@@ -1466,12 +1466,6 @@
           "url": "/d/atlas-storage",
           "targetBlank": true
         }
-      ],
-      "transformations": [
-        {
-          "id": "labelsToFields",
-          "options": {}
-        }
       ]
     }
   ],
@@ -1512,6 +1506,12 @@
       "type": "dashboard",
       "dashboardUid": "atlas-network",
       "keepTime": false
+    },
+    {
+      "title": "Atlas GPU",
+      "type": "dashboard",
+      "dashboardUid": "atlas-gpu",
+      "keepTime": false
     }
   ]
 }
diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml
new file mode 100644
index 0000000..13262d6
--- /dev/null
+++ b/services/monitoring/grafana-dashboard-gpu.yaml
@@ -0,0 +1,193 @@
+# services/monitoring/grafana-dashboard-gpu.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-gpu
+  labels:
+    grafana_dashboard: "1"
+data:
+  atlas-gpu.json: |
+    {
+      "uid": "atlas-gpu",
+      "title": "Atlas GPU",
+      "folderUid": "atlas-internal",
+      "editable": true,
+      "panels": [
+        {
+          "id": 1,
+          "type": "piechart",
+          "title": "Namespace GPU share",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+              "refId": "A",
+              "legendFormat": "{{namespace}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent",
+              "color": {
+                "mode": "palette-classic"
+              }
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "list",
+              "placement": "right"
+            },
+            "pieType": "pie",
+            "displayLabels": [
+              "percent"
+            ],
+            "tooltip": {
+              "mode": "single"
+            },
+            "colorScheme": "interpolateSpectral",
+            "colorBy": "value",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            }
+          }
+        },
+        {
+          "id": 2,
+          "type": "timeseries",
+          "title": "GPU util by namespace",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 0
+          },
+          "targets": [
+            {
+              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
+              "refId": "A",
+              "legendFormat": "{{namespace}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 3,
+          "type": "timeseries",
+          "title": "GPU util by node",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 8
+          },
+          "targets": [
+            {
+              "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})",
+              "refId": "A",
+              "legendFormat": "{{Hostname}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "legend": {
+              "displayMode": "table",
+              "placement": "right"
+            },
+            "tooltip": {
+              "mode": "multi"
+            }
+          }
+        },
+        {
+          "id": 4,
+          "type": "table",
+          "title": "Top pods by GPU util",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "atlas-vm"
+          },
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 8
+          },
+          "targets": [
+            {
+              "expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))",
+              "refId": "A"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": {
+              "unit": "percent"
+            },
+            "overrides": []
+          },
+          "options": {
+            "showHeader": true
+          },
+          "transformations": [
+            {
+              "id": "labelsToFields",
+              "options": {}
+            }
+          ]
+        }
+      ],
+      "time": {
+        "from": "now-12h",
+        "to": "now"
+      },
+      "annotations": {
+        "list": []
+      },
+      "schemaVersion": 39,
+      "style": "dark",
+      "tags": [
+        "atlas",
+        "gpu"
+      ]
+    }
diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml
index d2372de..4b78fb9 100644
--- a/services/monitoring/grafana-dashboard-network.yaml
+++ b/services/monitoring/grafana-dashboard-network.yaml
@@ -211,8 +211,7 @@ data:
             {
               "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
               "refId": "A",
-              "legendFormat": "{{router}}",
-              "instant": true
+              "legendFormat": "{{router}}"
             }
           ],
           "fieldConfig": {
@@ -271,7 +270,7 @@ data:
           },
           "targets": [
             {
-              "expr": "avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
+              "expr": "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
               "refId": "A",
               "legendFormat": "{{node}}"
             }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index ebd9b2b..512adf9 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -1465,7 +1465,7 @@ data:
               "calcs": [
                 "lastNotNull"
               ],
-              "fields": "",
+              "fields": "/.*/",
               "values": false
             }
           },
@@ -1475,12 +1475,6 @@ data:
               "url": "/d/atlas-storage",
               "targetBlank": true
             }
-          ],
-          "transformations": [
-            {
-              "id": "labelsToFields",
-              "options": {}
-            }
           ]
         }
       ],
@@ -1521,6 +1515,12 @@ data:
           "type": "dashboard",
           "dashboardUid": "atlas-network",
           "keepTime": false
+        },
+        {
+          "title": "Atlas GPU",
+          "type": "dashboard",
+          "dashboardUid": "atlas-gpu",
+          "keepTime": false
         }
       ]
     }
diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml
index 5a8f1ba..cf56b27 100644
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@@ -320,6 +320,14 @@ spec:
             editable: true
             options:
               path: /var/lib/grafana/dashboards/storage
+          - name: gpu
+            orgId: 1
+            folder: Atlas Internal
+            type: file
+            disableDeletion: false
+            editable: true
+            options:
+              path: /var/lib/grafana/dashboards/gpu
           - name: network
             orgId: 1
             folder: Atlas Internal
@@ -333,6 +341,7 @@ spec:
       pods: grafana-dashboard-pods
       nodes: grafana-dashboard-nodes
       storage: grafana-dashboard-storage
+      gpu: grafana-dashboard-gpu
       network: grafana-dashboard-network
     extraConfigmapMounts:
       - name: grafana-folders
diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml
index 3164862..a50a1c1 100644
--- a/services/monitoring/kustomization.yaml
+++ b/services/monitoring/kustomization.yaml
@@ -10,6 +10,7 @@ resources:
   - grafana-dashboard-nodes.yaml
   - grafana-dashboard-storage.yaml
   - grafana-dashboard-network.yaml
+  - grafana-dashboard-gpu.yaml
   - dcgm-exporter.yaml
   - grafana-folders.yaml
   - helmrelease.yaml