diff --git a/infrastructure/modules/profiles/atlas-ha/kustomization.yaml b/infrastructure/modules/profiles/atlas-ha/kustomization.yaml
index 7e69171..0502e01 100644
--- a/infrastructure/modules/profiles/atlas-ha/kustomization.yaml
+++ b/infrastructure/modules/profiles/atlas-ha/kustomization.yaml
@@ -2,6 +2,7 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
+  - ../components/device-plugin-config
   - ../components/device-plugin-jetson
   - ../components/device-plugin-minipc
   - ../components/device-plugin-tethys
diff --git a/infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml b/infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml
new file mode 100644
index 0000000..73c61cf
--- /dev/null
+++ b/infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml
@@ -0,0 +1,15 @@
+# infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: nvidia-device-plugin-config
+  namespace: kube-system
+data:
+  config.yaml: |
+    version: v1
+    sharing:
+      timeSlicing:
+        renameByDefault: true
+        resources:
+          - name: nvidia.com/gpu
+            replicas: 4
diff --git a/infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml b/infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml
new file mode 100644
index 0000000..346f526
--- /dev/null
+++ b/infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml
@@ -0,0 +1,5 @@
+# infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - configmap.yaml
diff --git a/infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml b/infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml
index f4953ea..0fa8376 100644
--- a/infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml
+++ b/infrastructure/modules/profiles/components/device-plugin-jetson/daemonset.yaml
@@ -31,6 +31,7 @@ spec:
           args:
             - "--fail-on-init-error=false"
             - "--device-list-strategy=envvar,cdi"
+            - "--config-file=/config/config.yaml"
           securityContext:
             privileged: true
           env:
@@ -41,7 +42,12 @@ spec:
           volumeMounts:
             - name: device-plugin
               mountPath: /var/lib/kubelet/device-plugins
+            - name: config
+              mountPath: /config
       volumes:
         - name: device-plugin
           hostPath:
             path: /var/lib/kubelet/device-plugins
+        - name: config
+          configMap:
+            name: nvidia-device-plugin-config
diff --git a/infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml b/infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml
index 76b6c06..309593a 100644
--- a/infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml
+++ b/infrastructure/modules/profiles/components/device-plugin-minipc/daemonset.yaml
@@ -32,6 +32,7 @@ spec:
             - "--fail-on-init-error=false"
             - "--device-list-strategy=envvar"
             - "--mig-strategy=none"
+            - "--config-file=/config/config.yaml"
           securityContext:
             privileged: true
           env:
@@ -42,7 +43,12 @@ spec:
           volumeMounts:
             - name: device-plugin
               mountPath: /var/lib/kubelet/device-plugins
+            - name: config
+              mountPath: /config
       volumes:
         - name: device-plugin
           hostPath:
             path: /var/lib/kubelet/device-plugins
+        - name: config
+          configMap:
+            name: nvidia-device-plugin-config
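With renameByDefault: true and replicas: 4, the plugin should stop advertising nvidia.com/gpu on these nodes and instead expose four nvidia.com/gpu.shared slices per physical GPU, which is what the workload manifests later in this change request. A minimal sketch for checking what each node actually advertises once the DaemonSets pick up the new ConfigMap — it assumes the kubernetes Python client is installed and a kubeconfig that can reach this cluster:

# Sketch: report per-node allocatable GPU resources after the time-slicing config rolls out.
# Assumes the `kubernetes` client package and a kubeconfig for this cluster.
from kubernetes import client, config

config.load_kube_config()
core = client.CoreV1Api()

for node in core.list_node().items:
    alloc = node.status.allocatable or {}
    shared = alloc.get("nvidia.com/gpu.shared", "0")
    plain = alloc.get("nvidia.com/gpu", "0")
    # With renameByDefault: true and replicas: 4, a single-GPU node is expected to
    # report gpu.shared=4 and gpu=0 once the plugin pod restarts with the config file.
    print(f"{node.metadata.name}: nvidia.com/gpu.shared={shared} nvidia.com/gpu={plain}")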
diff --git a/infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml b/infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml
index a15930a..884befa 100644
--- a/infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml
+++ b/infrastructure/modules/profiles/components/device-plugin-tethys/daemonset.yaml
@@ -33,6 +33,7 @@ spec:
             - "--fail-on-init-error=false"
             - "--device-list-strategy=envvar"
             - "--mig-strategy=none"
+            - "--config-file=/config/config.yaml"
           securityContext:
             privileged: true
           env:
@@ -43,7 +44,12 @@ spec:
           volumeMounts:
             - name: device-plugin
               mountPath: /var/lib/kubelet/device-plugins
+            - name: config
+              mountPath: /config
       volumes:
         - name: device-plugin
           hostPath:
             path: /var/lib/kubelet/device-plugins
+        - name: config
+          configMap:
+            name: nvidia-device-plugin-config
diff --git a/infrastructure/modules/profiles/tethys-hybrid/kustomization.yaml b/infrastructure/modules/profiles/tethys-hybrid/kustomization.yaml
index b55c059..ad951ec 100644
--- a/infrastructure/modules/profiles/tethys-hybrid/kustomization.yaml
+++ b/infrastructure/modules/profiles/tethys-hybrid/kustomization.yaml
@@ -2,4 +2,5 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
+  - ../components/device-plugin-config
   - ../components/device-plugin-tethys
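All three GPU node flavours now start the plugin with --config-file=/config/config.yaml, and both profiles (atlas-ha and tethys-hybrid) pull in the shared device-plugin-config component, so the ConfigMap renders alongside whichever DaemonSets mount it. A quick local check, assuming kubectl (with its built-in kustomize) is on PATH and this is run from the repository root:

# Sketch: render each profile overlay and confirm the shared ConfigMap is part of the output.
# Assumes `kubectl` is installed and the repository layout matches the paths in this change.
import subprocess

PROFILES = [
    "infrastructure/modules/profiles/atlas-ha",
    "infrastructure/modules/profiles/tethys-hybrid",
]

for profile in PROFILES:
    rendered = subprocess.run(
        ["kubectl", "kustomize", profile],
        check=True,
        capture_output=True,
        text=True,
    ).stdout
    has_configmap = "name: nvidia-device-plugin-config" in rendered
    print(f"{profile}: configmap rendered={has_configmap}")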
diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py
index 7ad117b..7994cf7 100644
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@@ -171,9 +171,8 @@ def node_io_expr(scope=""):
 
 
 def namespace_share_expr(resource_expr):
-    selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )"
-    total = f"clamp_min(sum( {selected} ), 1)"
-    return f"100 * ( {selected} ) / {total}"
+    total = f"clamp_min(sum( {resource_expr} ), 1)"
+    return f"100 * ( {resource_expr} ) / {total}"
 
 
 def namespace_cpu_share_expr():
@@ -185,7 +184,10 @@ def namespace_ram_share_expr():
 
 
 def namespace_gpu_share_expr():
-    return namespace_share_expr(NAMESPACE_GPU_RAW)
+    total = f"(sum({NAMESPACE_GPU_USAGE_INSTANT}) or on() vector(0))"
+    share = f"100 * ({NAMESPACE_GPU_USAGE_INSTANT}) / clamp_min({total}, 1)"
+    idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
+    return f"({share}) or {idle}"
 
 
 PROBLEM_PODS_EXPR = (
@@ -270,46 +272,20 @@ STUCK_TABLE_EXPR = (
     ")"
 )
 
+NAMESPACE_SCOPE_WORKLOAD = 'namespace!~"(^kube.*|.*-system$|^traefik$)"'
+NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
+NAMESPACE_SCOPE_INFRA = 'namespace=~"(^kube.*|.*-system$|^traefik$)"'
+NAMESPACE_SCOPE_VAR = "$namespace_scope"
+NAMESPACE_SELECTOR = f'namespace!="",pod!="",container!="",{NAMESPACE_SCOPE_VAR}'
+NAMESPACE_GPU_SELECTOR = f'namespace!="",pod!="",{NAMESPACE_SCOPE_VAR}'
+
 NAMESPACE_CPU_RAW = (
-    'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
-)
-NAMESPACE_RAM_RAW = (
-    'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
+    f'sum(rate(container_cpu_usage_seconds_total{{{NAMESPACE_SELECTOR}}}[5m])) by (namespace)'
 )
+NAMESPACE_RAM_RAW = f'sum(container_memory_working_set_bytes{{{NAMESPACE_SELECTOR}}}) by (namespace)'
 GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
 GPU_NODE_REGEX = "|".join(GPU_NODES)
-NAMESPACE_GPU_ALLOC = (
-    'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
-    ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
-)
-NAMESPACE_GPU_USAGE_SHARE = (
-    'sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))'
-)
-NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
-NAMESPACE_GPU_RAW = (
-    "("
-    + NAMESPACE_GPU_USAGE_SHARE
-    + ") or on(namespace) ("
-    + NAMESPACE_CPU_RAW
-    + " * 0)"
-)
-NAMESPACE_GPU_WEIGHT = (
-    "("
-    + NAMESPACE_GPU_ALLOC
-    + ") or on(namespace) ("
-    + NAMESPACE_CPU_RAW
-    + " * 0)"
-)
-NAMESPACE_ACTIVITY_SCORE = (
-    "( "
-    + NAMESPACE_CPU_RAW
-    + " ) + ("
-    + NAMESPACE_RAM_RAW
-    + " / 1e9) + ("
-    + NAMESPACE_GPU_WEIGHT
-    + " * 100)"
-)
-NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
+NAMESPACE_GPU_USAGE_INSTANT = f'sum(DCGM_FI_DEV_GPU_UTIL{{{NAMESPACE_GPU_SELECTOR}}}) by (namespace)'
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
 TRAEFIK_NET_INGRESS = (
     'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
@@ -588,6 +564,44 @@ def pie_panel(panel_id, title, expr, grid):
     }
 
 
+def namespace_scope_variable():
+    options = [
+        {
+            "text": "workload namespaces only",
+            "value": NAMESPACE_SCOPE_WORKLOAD,
+            "selected": True,
+        },
+        {"text": "all namespaces", "value": NAMESPACE_SCOPE_ALL, "selected": False},
+        {
+            "text": "infrastructure namespaces only",
+            "value": NAMESPACE_SCOPE_INFRA,
+            "selected": False,
+        },
+    ]
+    query = (
+        "workload namespaces only : "
+        + NAMESPACE_SCOPE_WORKLOAD
+        + ",all namespaces : "
+        + NAMESPACE_SCOPE_ALL
+        + ",infrastructure namespaces only : "
+        + NAMESPACE_SCOPE_INFRA
+    )
+    return {
+        "name": "namespace_scope",
+        "label": "Namespace filter",
+        "type": "custom",
+        "query": query,
+        "current": {"text": options[0]["text"], "value": options[0]["value"], "selected": True},
+        "options": options,
+        "hide": 0,
+        "multi": False,
+        "includeAll": False,
+        "refresh": 1,
+        "sort": 0,
+        "skipUrlSync": False,
+    }
+
+
 def bargauge_panel(
     panel_id,
     title,
@@ -1063,7 +1077,7 @@ def build_overview():
         "schemaVersion": 39,
         "style": "dark",
         "tags": ["atlas", "overview"],
-        "templating": {"list": []},
+        "templating": {"list": [namespace_scope_variable()]},
         "time": {"from": "now-1h", "to": "now"},
         "refresh": "1m",
         "links": [],
@@ -1757,6 +1771,7 @@ def build_gpu_dashboard():
         "schemaVersion": 39,
         "style": "dark",
         "tags": ["atlas", "gpu"],
+        "templating": {"list": [namespace_scope_variable()]},
     }
 
 
diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml
index fb0d0e7..b74dc0a 100644
--- a/services/ai-llm/deployment.yaml
+++ b/services/ai-llm/deployment.yaml
@@ -16,10 +16,20 @@ spec:
         app: ollama
       annotations:
         ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
-        ai.bstein.dev/gpu: RTX 3080 8GB (titan-24)
+        ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24)
     spec:
-      nodeSelector:
-        kubernetes.io/hostname: titan-24
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values:
+                      - titan-20
+                      - titan-21
+                      - titan-22
+                      - titan-24
       runtimeClassName: nvidia
       volumes:
         - name: models
@@ -55,9 +65,9 @@ spec:
           requests:
             cpu: 250m
             memory: 1Gi
-            nvidia.com/gpu: 1
+            nvidia.com/gpu.shared: 1
           limits:
-            nvidia.com/gpu: 1
+            nvidia.com/gpu.shared: 1
       containers:
         - name: ollama
           image: ollama/ollama:latest
@@ -83,8 +93,8 @@ spec:
           requests:
             cpu: "2"
            memory: 8Gi
-            nvidia.com/gpu: 1
+            nvidia.com/gpu.shared: 1
           limits:
             cpu: "4"
             memory: 12Gi
-            nvidia.com/gpu: 1
+            nvidia.com/gpu.shared: 1
diff --git a/services/bstein-dev-home/backend-deployment.yaml
b/services/bstein-dev-home/backend-deployment.yaml index 2a03a24..21f74ba 100644 --- a/services/bstein-dev-home/backend-deployment.yaml +++ b/services/bstein-dev-home/backend-deployment.yaml @@ -39,7 +39,7 @@ spec: fieldPath: spec.nodeName - name: AI_NODE_GPU_MAP value: | - {"titan-24": "RTX 3080 8GB (local GPU)", "titan-22": "RTX 3050 8GB (local GPU)"} + {"titan-20": "Jetson Xavier (edge GPU)", "titan-21": "Jetson Xavier (edge GPU)", "titan-22": "RTX 3050 8GB (local GPU)", "titan-24": "RTX 3080 8GB (local GPU)"} ports: - name: http containerPort: 8080 diff --git a/services/jellyfin/deployment.yaml b/services/jellyfin/deployment.yaml index 88fa9dd..1177a06 100644 --- a/services/jellyfin/deployment.yaml +++ b/services/jellyfin/deployment.yaml @@ -68,8 +68,18 @@ spec: volumeMounts: - name: config mountPath: /config - nodeSelector: - jellyfin: "true" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - titan-20 + - titan-21 + - titan-22 + - titan-24 securityContext: runAsUser: 1000 fsGroup: 65532 @@ -96,11 +106,11 @@ spec: value: "002" resources: limits: - nvidia.com/gpu: 1 + nvidia.com/gpu.shared: 1 # cpu: "4" # memory: 8Gi requests: - nvidia.com/gpu: 1 + nvidia.com/gpu.shared: 1 cpu: "500m" memory: 1Gi volumeMounts: diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 572c2c6..e0a631b 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() 
vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -71,7 +71,7 @@ }, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -182,5 +182,43 @@ "tags": [ "atlas", "gpu" - ] + ], + "templating": { + "list": [ + { + "name": "namespace_scope", + "label": "Namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "selected": false + } + ], + "hide": 0, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + } + ] + } } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 70062e0..087b9af 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1086,7 +1086,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) 
), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1137,7 +1137,7 @@ }, "targets": [ { - "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1188,7 +1188,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or 
kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1791,7 +1791,42 @@ "overview" ], "templating": { - "list": [] + "list": [ + { + "name": "namespace_scope", + "label": "Namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "selected": false + } + ], + "hide": 0, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + } + ] }, "time": { "from": "now-1h", diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 48725de..b90dddb 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= 
bool 0) ) ), 1)", + "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -80,7 +80,7 @@ data: }, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -191,5 +191,43 @@ data: "tags": [ "atlas", "gpu" - ] + ], + "templating": { + "list": [ + { + "name": "namespace_scope", + "label": "Namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "selected": false + } + ], + "hide": 0, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + } + ] + } } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index cfd2cd6..b2aca02 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1095,7 +1095,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1146,7 +1146,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1197,7 +1197,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) 
+ (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1800,7 +1800,42 @@ data: "overview" ], "templating": { - "list": [] + "list": [ + { + "name": "namespace_scope", + "label": "Namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "selected": false + } + ], + "hide": 0, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + } + ] }, "time": { "from": "now-1h",
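The regenerated GPU pie expression replaces the old topk-based activity filter with a plain share-of-total: each namespace's DCGM utilisation divided by the clamped cluster total, with a synthetic "idle" series pinned at 100% when no utilisation is reported at all, so the pie never renders empty. A standalone sketch of the same construction, expanding $namespace_scope the way Grafana would for the default option (names mirror the generator script, but this is an illustration rather than the script itself):

# Sketch: rebuild the namespace GPU-share expression and expand $namespace_scope
# the way Grafana would before the query reaches Prometheus.
NAMESPACE_SCOPE_WORKLOAD = 'namespace!~"(^kube.*|.*-system$|^traefik$)"'
GPU_SELECTOR = 'namespace!="",pod!="",$namespace_scope'
GPU_INSTANT = f"sum(DCGM_FI_DEV_GPU_UTIL{{{GPU_SELECTOR}}}) by (namespace)"


def gpu_share_expr() -> str:
    # Share of total, clamped so division by zero cannot occur; if the total is
    # exactly zero, fall back to a single "idle" series at 100%.
    total = f"(sum({GPU_INSTANT}) or on() vector(0))"
    share = f"100 * ({GPU_INSTANT}) / clamp_min({total}, 1)"
    idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
    return f"({share}) or {idle}"


# Substitute the dashboard variable as Grafana would for the default option.
final_query = gpu_share_expr().replace("$namespace_scope", NAMESPACE_SCOPE_WORKLOAD)
print(final_query)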
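The namespace_scope dashboard variable is a Grafana "custom" variable, so its query field is just a comma-separated list of "text : value" pairs, and the selected value is spliced into each PromQL selector as a raw label matcher. A small sketch of how that query string relates to the options the dashboards define (same three scopes; the round-trip parse here is only a sanity check, not Grafana's own parser):

# Sketch: build a Grafana custom-variable query string ("text : value,text : value")
# from the scope options used by these dashboards, then parse it back as a check.
SCOPES = {
    "workload namespaces only": 'namespace!~"(^kube.*|.*-system$|^traefik$)"',
    "all namespaces": 'namespace=~".*"',
    "infrastructure namespaces only": 'namespace=~"(^kube.*|.*-system$|^traefik$)"',
}

query = ",".join(f"{text} : {value}" for text, value in SCOPES.items())

# None of the values contain commas or " : ", so a naive split recovers the pairs.
parsed = dict(part.split(" : ", 1) for part in query.split(","))
assert parsed == SCOPES
print(query)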