gpu: enable time-slicing and refresh dashboards

2026-01-01 14:16:08 -03:00 · 2026-01-01 14:16:08 -03:00 · 6a76fc0fa3
commit 6a76fc0fa3
parent 7020d53fd8
15 changed files with 288 additions and 67 deletions
--- a/infrastructure/modules/profiles/atlas-ha/kustomization.yaml
+++ b/infrastructure/modules/profiles/atlas-ha/kustomization.yaml
@ -2,6 +2,7 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
+  - ../components/device-plugin-config
  - ../components/device-plugin-jetson
  - ../components/device-plugin-minipc
  - ../components/device-plugin-tethys
--- a/infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml
+++ b/infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml
@ -0,0 +1,15 @@
+# infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml
+# Shared NVIDIA device-plugin configuration. The jetson, minipc and tethys
+# device-plugin DaemonSets mount this ConfigMap at /config and load it with
+# --config-file=/config/config.yaml.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: nvidia-device-plugin-config
+  namespace: kube-system
+data:
+  # Time-slicing: nvidia.com/gpu is configured with 4 replicas per device.
+  # renameByDefault is on; per the NVIDIA device-plugin docs the replicas are
+  # then advertised under the ".shared" suffix, which matches the
+  # nvidia.com/gpu.shared requests used by the ollama and jellyfin workloads.
+  # NOTE(review): consider also setting failRequestsGreaterThanOne: true so a
+  # request for more than one shared replica is rejected - confirm with owners.
+  config.yaml: |
+    version: v1
+    sharing:
+      timeSlicing:
+        renameByDefault: true
+        resources:
+          - name: nvidia.com/gpu
+            replicas: 4
--- a/infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml
+++ b/infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml
@ -0,0 +1,5 @@
+# infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml
+# Component that ships the shared device-plugin ConfigMap (configmap.yaml).
+# Listed as a resource by the atlas-ha and tethys-hybrid profiles.
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - configmap.yaml
--- a/infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml
+++ b/infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml
@ -31,6 +31,7 @@ spec:
          args:
            - "--fail-on-init-error=false"
            - "--device-list-strategy=envvar,cdi"
+            - "--config-file=/config/config.yaml"
          securityContext:
            privileged: true
          env:
@ -41,7 +42,12 @@ spec:
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
+            - name: config
+              mountPath: /config
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
+        - name: config
+          configMap:
+            name: nvidia-device-plugin-config
--- a/infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml
+++ b/infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml
@ -32,6 +32,7 @@ spec:
            - "--fail-on-init-error=false"
            - "--device-list-strategy=envvar"
            - "--mig-strategy=none"
+            - "--config-file=/config/config.yaml"
          securityContext:
            privileged: true
          env:
@ -42,7 +43,12 @@ spec:
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
+            - name: config
+              mountPath: /config
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
+        - name: config
+          configMap:
+            name: nvidia-device-plugin-config
--- a/infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml
+++ b/infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml
@ -33,6 +33,7 @@ spec:
            - "--fail-on-init-error=false"
            - "--device-list-strategy=envvar"
            - "--mig-strategy=none"
+            - "--config-file=/config/config.yaml"
          securityContext:
            privileged: true
          env:
@ -43,7 +44,12 @@ spec:
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
+            - name: config
+              mountPath: /config
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
+        - name: config
+          configMap:
+            name: nvidia-device-plugin-config
--- a/infrastructure/modules/profiles/tethys-hybrid/kustomization.yaml
+++ b/infrastructure/modules/profiles/tethys-hybrid/kustomization.yaml
@ -2,4 +2,5 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
+  - ../components/device-plugin-config
  - ../components/device-plugin-tethys
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@ -171,9 +171,8 @@ def node_io_expr(scope=""):


 def namespace_share_expr(resource_expr):
-    selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )"
-    total = f"clamp_min(sum( {selected} ), 1)"
-    return f"100 * ( {selected} ) / {total}"
+    """Return a PromQL string expressing each series as a percentage of the total.
+
+    ``resource_expr`` is a PromQL expression string. It is embedded twice:
+    once as the numerator and once inside ``sum(...)`` as the denominator.
+    """
+    # clamp_min(..., 1) keeps the denominator at 1 or above (no division by zero).
+    # NOTE(review): when the real total is below 1 the shares will not add up
+    # to 100 - confirm this is acceptable for low CPU totals.
+    total = f"clamp_min(sum( {resource_expr} ), 1)"
+    return f"100 * ( {resource_expr} ) / {total}"


 def namespace_cpu_share_expr():
@ -185,7 +184,10 @@ def namespace_ram_share_expr():


 def namespace_gpu_share_expr():
-    return namespace_share_expr(NAMESPACE_GPU_RAW)
+    """Return the PromQL string for the per-namespace GPU share, with an idle slice.
+
+    The share is built from NAMESPACE_GPU_USAGE_INSTANT. ``total`` falls back
+    to 0 through ``or on() vector(0)`` when the metric yields no series. When
+    ``total`` equals 0, a synthetic series labelled namespace "idle" with
+    value 100 is appended to the result.
+    """
+    total = f"(sum({NAMESPACE_GPU_USAGE_INSTANT}) or on() vector(0))"
+    share = f"100 * ({NAMESPACE_GPU_USAGE_INSTANT}) / clamp_min({total}, 1)"
+    idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
+    return f"({share}) or {idle}"


 PROBLEM_PODS_EXPR = (
@ -270,46 +272,20 @@ STUCK_TABLE_EXPR = (
    ")"
 )

+# Label-matcher fragments offered by the dashboards' namespace filter variable.
+# WORKLOAD excludes, and INFRA selects, namespaces matching kube*, *-system or traefik.
+NAMESPACE_SCOPE_WORKLOAD = 'namespace!~"(^kube.*|.*-system$|^traefik$)"'
+NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
+NAMESPACE_SCOPE_INFRA = 'namespace=~"(^kube.*|.*-system$|^traefik$)"'
+# Reference to the Grafana variable named "namespace_scope"; it is emitted
+# verbatim into the generated queries and resolved by Grafana, not by this script.
+NAMESPACE_SCOPE_VAR = "$namespace_scope"
+# Selector bodies shared by the namespace panels; the GPU variant omits container!="".
+NAMESPACE_SELECTOR = f'namespace!="",pod!="",container!="",{NAMESPACE_SCOPE_VAR}'
+NAMESPACE_GPU_SELECTOR = f'namespace!="",pod!="",{NAMESPACE_SCOPE_VAR}'
+
 NAMESPACE_CPU_RAW = (
-    'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
-)
-NAMESPACE_RAM_RAW = (
-    'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
+    f'sum(rate(container_cpu_usage_seconds_total{{{NAMESPACE_SELECTOR}}}[5m])) by (namespace)'
 )
+NAMESPACE_RAM_RAW = f'sum(container_memory_working_set_bytes{{{NAMESPACE_SELECTOR}}}) by (namespace)'
 GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
 GPU_NODE_REGEX = "|".join(GPU_NODES)
-NAMESPACE_GPU_ALLOC = (
-    'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
-    ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
-)
-NAMESPACE_GPU_USAGE_SHARE = (
-    'sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))'
-)
-NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
-NAMESPACE_GPU_RAW = (
-    "("
-    + NAMESPACE_GPU_USAGE_SHARE
-    + ") or on(namespace) ("
-    + NAMESPACE_CPU_RAW
-    + " * 0)"
-)
-NAMESPACE_GPU_WEIGHT = (
-    "("
-    + NAMESPACE_GPU_ALLOC
-    + ") or on(namespace) ("
-    + NAMESPACE_CPU_RAW
-    + " * 0)"
-)
-NAMESPACE_ACTIVITY_SCORE = (
-    "( "
-    + NAMESPACE_CPU_RAW
-    + " ) + ("
-    + NAMESPACE_RAM_RAW
-    + " / 1e9) + ("
-    + NAMESPACE_GPU_WEIGHT
-    + " * 100)"
-)
-NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
+NAMESPACE_GPU_USAGE_INSTANT = f'sum(DCGM_FI_DEV_GPU_UTIL{{{NAMESPACE_GPU_SELECTOR}}}) by (namespace)'
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
 TRAEFIK_NET_INGRESS = (
    'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
@ -588,6 +564,44 @@ def pie_panel(panel_id, title, expr, grid):
    }


+def namespace_scope_variable():
+    """Build the "namespace_scope" templating variable for a dashboard.
+
+    Returns a dict describing a single-select custom variable with three
+    options (workload, all, infrastructure namespaces). Each option's value
+    is one of the NAMESPACE_SCOPE_* label-matcher strings, and the workload
+    option is the selected default.
+    """
+    options = [
+        {
+            "text": "workload namespaces only",
+            "value": NAMESPACE_SCOPE_WORKLOAD,
+            "selected": True,
+        },
+        {"text": "all namespaces", "value": NAMESPACE_SCOPE_ALL, "selected": False},
+        {
+            "text": "infrastructure namespaces only",
+            "value": NAMESPACE_SCOPE_INFRA,
+            "selected": False,
+        },
+    ]
+    # Same three choices as `options`, flattened into a comma-separated
+    # "text : value" string for the variable's query field.
+    query = (
+        "workload namespaces only : "
+        + NAMESPACE_SCOPE_WORKLOAD
+        + ",all namespaces : "
+        + NAMESPACE_SCOPE_ALL
+        + ",infrastructure namespaces only : "
+        + NAMESPACE_SCOPE_INFRA
+    )
+    return {
+        "name": "namespace_scope",
+        "label": "Namespace filter",
+        "type": "custom",
+        "query": query,
+        # The first option (workload namespaces) is the initial selection.
+        "current": {"text": options[0]["text"], "value": options[0]["value"], "selected": True},
+        "options": options,
+        "hide": 0,
+        "multi": False,
+        "includeAll": False,
+        "refresh": 1,
+        "sort": 0,
+        "skipUrlSync": False,
+    }
+
+
 def bargauge_panel(
    panel_id,
    title,
@ -1063,7 +1077,7 @@ def build_overview():
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "overview"],
-        "templating": {"list": []},
+        "templating": {"list": [namespace_scope_variable()]},
        "time": {"from": "now-1h", "to": "now"},
        "refresh": "1m",
        "links": [],
@ -1757,6 +1771,7 @@ def build_gpu_dashboard():
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "gpu"],
+        "templating": {"list": [namespace_scope_variable()]},
    }


--- a/services/ai-llm/deployment.yaml
+++ b/services/ai-llm/deployment.yaml
@ -16,10 +16,20 @@ spec:
        app: ollama
      annotations:
        ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
-        ai.bstein.dev/gpu: RTX 3080 8GB (titan-24)
+        ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24)
    spec:
-      nodeSelector:
-        kubernetes.io/hostname: titan-24
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values:
+                      - titan-20
+                      - titan-21
+                      - titan-22
+                      - titan-24
      runtimeClassName: nvidia
      volumes:
        - name: models
@ -55,9 +65,9 @@ spec:
            requests:
              cpu: 250m
              memory: 1Gi
-              nvidia.com/gpu: 1
+              nvidia.com/gpu.shared: 1
            limits:
-              nvidia.com/gpu: 1
+              nvidia.com/gpu.shared: 1
      containers:
        - name: ollama
          image: ollama/ollama:latest
@ -83,8 +93,8 @@ spec:
            requests:
              cpu: "2"
              memory: 8Gi
-              nvidia.com/gpu: 1
+              nvidia.com/gpu.shared: 1
            limits:
              cpu: "4"
              memory: 12Gi
-              nvidia.com/gpu: 1
+              nvidia.com/gpu.shared: 1
--- a/services/bstein-dev-home/backend-deployment.yaml
+++ b/services/bstein-dev-home/backend-deployment.yaml
@ -39,7 +39,7 @@ spec:
                  fieldPath: spec.nodeName
            - name: AI_NODE_GPU_MAP
              value: |
-                {"titan-24": "RTX 3080 8GB (local GPU)", "titan-22": "RTX 3050 8GB (local GPU)"}
+                {"titan-20": "Jetson Xavier (edge GPU)", "titan-21": "Jetson Xavier (edge GPU)", "titan-22": "RTX 3050 8GB (local GPU)", "titan-24": "RTX 3080 8GB (local GPU)"}
          ports:
            - name: http
              containerPort: 8080
--- a/services/jellyfin/deployment.yaml
+++ b/services/jellyfin/deployment.yaml
@ -68,8 +68,18 @@ spec:
          volumeMounts:
            - name: config
              mountPath: /config
-      nodeSelector:
-        jellyfin: "true"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values:
+                      - titan-20
+                      - titan-21
+                      - titan-22
+                      - titan-24
      securityContext:
        runAsUser: 1000 
        fsGroup: 65532
@ -96,11 +106,11 @@ spec:
              value: "002"
          resources:
            limits:
-              nvidia.com/gpu: 1
+              nvidia.com/gpu.shared: 1
            #   cpu: "4"
            #   memory: 8Gi
            requests:
-              nvidia.com/gpu: 1
+              nvidia.com/gpu.shared: 1
              cpu: "500m"
              memory: 1Gi
          volumeMounts:
--- a/services/monitoring/dashboards/atlas-gpu.json
+++ b/services/monitoring/dashboards/atlas-gpu.json
@ -20,7 +20,7 @@
      },
      "targets": [
        {
-          "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
          "refId": "A",
          "legendFormat": "{{namespace}}"
        }
@ -71,7 +71,7 @@
      },
      "targets": [
        {
-          "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
+          "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)",
          "refId": "A",
          "legendFormat": "{{namespace}}"
        }
@ -182,5 +182,43 @@
  "tags": [
    "atlas",
    "gpu"
+  ],
+  "templating": {
+    "list": [
+      {
+        "name": "namespace_scope",
+        "label": "Namespace filter",
+        "type": "custom",
+        "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+        "current": {
+          "text": "workload namespaces only",
+          "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+          "selected": true
+        },
+        "options": [
+          {
+            "text": "workload namespaces only",
+            "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+            "selected": true
+          },
+          {
+            "text": "all namespaces",
+            "value": "namespace=~\".*\"",
+            "selected": false
+          },
+          {
+            "text": "infrastructure namespaces only",
+            "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+            "selected": false
+          }
+        ],
+        "hide": 0,
+        "multi": false,
+        "includeAll": false,
+        "refresh": 1,
+        "sort": 0,
+        "skipUrlSync": false
+      }
    ]
  }
+}
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@ -1086,7 +1086,7 @@
      },
      "targets": [
        {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ), 1)",
          "refId": "A",
          "legendFormat": "{{namespace}}"
        }
@ -1137,7 +1137,7 @@
      },
      "targets": [
        {
-          "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
          "refId": "A",
          "legendFormat": "{{namespace}}"
        }
@ -1188,7 +1188,7 @@
      },
      "targets": [
        {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ), 1)",
          "refId": "A",
          "legendFormat": "{{namespace}}"
        }
@ -1791,7 +1791,42 @@
    "overview"
  ],
  "templating": {
-    "list": []
+    "list": [
+      {
+        "name": "namespace_scope",
+        "label": "Namespace filter",
+        "type": "custom",
+        "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+        "current": {
+          "text": "workload namespaces only",
+          "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+          "selected": true
+        },
+        "options": [
+          {
+            "text": "workload namespaces only",
+            "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+            "selected": true
+          },
+          {
+            "text": "all namespaces",
+            "value": "namespace=~\".*\"",
+            "selected": false
+          },
+          {
+            "text": "infrastructure namespaces only",
+            "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+            "selected": false
+          }
+        ],
+        "hide": 0,
+        "multi": false,
+        "includeAll": false,
+        "refresh": 1,
+        "sort": 0,
+        "skipUrlSync": false
+      }
+    ]
  },
  "time": {
    "from": "now-1h",
--- a/services/monitoring/grafana-dashboard-gpu.yaml
+++ b/services/monitoring/grafana-dashboard-gpu.yaml
@ -29,7 +29,7 @@ data:
          },
          "targets": [
            {
-              "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+              "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
              "refId": "A",
              "legendFormat": "{{namespace}}"
            }
@ -80,7 +80,7 @@ data:
          },
          "targets": [
            {
-              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
+              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)",
              "refId": "A",
              "legendFormat": "{{namespace}}"
            }
@ -191,5 +191,43 @@ data:
      "tags": [
        "atlas",
        "gpu"
+      ],
+      "templating": {
+        "list": [
+          {
+            "name": "namespace_scope",
+            "label": "Namespace filter",
+            "type": "custom",
+            "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+            "current": {
+              "text": "workload namespaces only",
+              "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+              "selected": true
+            },
+            "options": [
+              {
+                "text": "workload namespaces only",
+                "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+                "selected": true
+              },
+              {
+                "text": "all namespaces",
+                "value": "namespace=~\".*\"",
+                "selected": false
+              },
+              {
+                "text": "infrastructure namespaces only",
+                "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+                "selected": false
+              }
+            ],
+            "hide": 0,
+            "multi": false,
+            "includeAll": false,
+            "refresh": 1,
+            "sort": 0,
+            "skipUrlSync": false
+          }
        ]
      }
+    }
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@ -1095,7 +1095,7 @@ data:
          },
          "targets": [
            {
-              "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+              "expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ), 1)",
              "refId": "A",
              "legendFormat": "{{namespace}}"
            }
@ -1146,7 +1146,7 @@ data:
          },
          "targets": [
            {
-              "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+              "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
              "refId": "A",
              "legendFormat": "{{namespace}}"
            }
@ -1197,7 +1197,7 @@ data:
          },
          "targets": [
            {
-              "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+              "expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ), 1)",
              "refId": "A",
              "legendFormat": "{{namespace}}"
            }
@ -1800,7 +1800,42 @@ data:
        "overview"
      ],
      "templating": {
-        "list": []
+        "list": [
+          {
+            "name": "namespace_scope",
+            "label": "Namespace filter",
+            "type": "custom",
+            "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+            "current": {
+              "text": "workload namespaces only",
+              "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+              "selected": true
+            },
+            "options": [
+              {
+                "text": "workload namespaces only",
+                "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+                "selected": true
+              },
+              {
+                "text": "all namespaces",
+                "value": "namespace=~\".*\"",
+                "selected": false
+              },
+              {
+                "text": "infrastructure namespaces only",
+                "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+                "selected": false
+              }
+            ],
+            "hide": 0,
+            "multi": false,
+            "includeAll": false,
+            "refresh": 1,
+            "sort": 0,
+            "skipUrlSync": false
+          }
+        ]
      },
      "time": {
        "from": "now-1h",