gpu: enable time-slicing and refresh dashboards
This commit is contained in:
parent
7020d53fd8
commit
6a76fc0fa3
@ -2,6 +2,7 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- ../components/device-plugin-config
|
||||
- ../components/device-plugin-jetson
|
||||
- ../components/device-plugin-minipc
|
||||
- ../components/device-plugin-tethys
|
||||
|
||||
@ -0,0 +1,15 @@
|
||||
# infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml
# Shared configuration for the NVIDIA device-plugin DaemonSets. Each DaemonSet
# mounts this ConfigMap at /config and passes --config-file=/config/config.yaml.
# Time-slicing settings: every nvidia.com/gpu resource is advertised with 4
# replicas, and renameByDefault exposes the shared resource under the name the
# workloads request (nvidia.com/gpu.shared).
apiVersion: v1
kind: ConfigMap
metadata:
  name: nvidia-device-plugin-config
  namespace: kube-system
data:
  config.yaml: |
    version: v1
    sharing:
      timeSlicing:
        renameByDefault: true
        resources:
          - name: nvidia.com/gpu
            replicas: 4
|
||||
@ -0,0 +1,5 @@
|
||||
# infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- configmap.yaml
|
||||
@ -31,6 +31,7 @@ spec:
|
||||
args:
|
||||
- "--fail-on-init-error=false"
|
||||
- "--device-list-strategy=envvar,cdi"
|
||||
- "--config-file=/config/config.yaml"
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
@ -41,7 +42,12 @@ spec:
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
mountPath: /config
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
configMap:
|
||||
name: nvidia-device-plugin-config
|
||||
|
||||
@ -32,6 +32,7 @@ spec:
|
||||
- "--fail-on-init-error=false"
|
||||
- "--device-list-strategy=envvar"
|
||||
- "--mig-strategy=none"
|
||||
- "--config-file=/config/config.yaml"
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
@ -42,7 +43,12 @@ spec:
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
mountPath: /config
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
configMap:
|
||||
name: nvidia-device-plugin-config
|
||||
|
||||
@ -33,6 +33,7 @@ spec:
|
||||
- "--fail-on-init-error=false"
|
||||
- "--device-list-strategy=envvar"
|
||||
- "--mig-strategy=none"
|
||||
- "--config-file=/config/config.yaml"
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
@ -43,7 +44,12 @@ spec:
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
mountPath: /config
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/device-plugins
|
||||
- name: config
|
||||
configMap:
|
||||
name: nvidia-device-plugin-config
|
||||
|
||||
@ -2,4 +2,5 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- ../components/device-plugin-config
|
||||
- ../components/device-plugin-tethys
|
||||
|
||||
@ -171,9 +171,8 @@ def node_io_expr(scope=""):
|
||||
|
||||
|
||||
def namespace_share_expr(resource_expr):
    """Return a PromQL expression giving each series' percentage of the total.

    Args:
        resource_expr: PromQL expression text (the callers pass a per-namespace
            aggregation). It is inserted verbatim as both the numerator and
            the summed denominator.

    Returns:
        A string of the form ``100 * ( <expr> ) / clamp_min(sum( <expr> ), 1)``.
    """
    # clamp_min keeps the denominator at 1 or above, so an all-zero total
    # produces 0% shares instead of a division by zero.
    total = f"clamp_min(sum( {resource_expr} ), 1)"
    return f"100 * ( {resource_expr} ) / {total}"
|
||||
|
||||
|
||||
def namespace_cpu_share_expr():
|
||||
@ -185,7 +184,10 @@ def namespace_ram_share_expr():
|
||||
|
||||
|
||||
def namespace_gpu_share_expr():
    """Return the PromQL expression for the per-namespace GPU share.

    The expression is the union of two parts:

    * each namespace's percentage of the summed ``NAMESPACE_GPU_USAGE_INSTANT``
      value, and
    * a synthetic series labelled ``namespace="idle"`` with value 100, emitted
      only when the summed GPU usage is exactly 0.

    Returns:
        The combined PromQL expression as a string.
    """
    # `or on() vector(0)` supplies a 0 total when no GPU series match, so the
    # `== 0` test below still has a sample to compare.
    total = f"(sum({NAMESPACE_GPU_USAGE_INSTANT}) or on() vector(0))"
    # clamp_min avoids dividing by zero when the total is 0.
    share = f"100 * ({NAMESPACE_GPU_USAGE_INSTANT}) / clamp_min({total}, 1)"
    # `and` binds tighter than `or` in PromQL, so the idle series is gated by
    # the `== 0` condition before being unioned with the real shares.
    idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
    return f"({share}) or {idle}"
|
||||
|
||||
|
||||
PROBLEM_PODS_EXPR = (
|
||||
@ -270,46 +272,20 @@ STUCK_TABLE_EXPR = (
|
||||
")"
|
||||
)
|
||||
|
||||
NAMESPACE_SCOPE_WORKLOAD = 'namespace!~"(^kube.*|.*-system$|^traefik$)"'
|
||||
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
|
||||
NAMESPACE_SCOPE_INFRA = 'namespace=~"(^kube.*|.*-system$|^traefik$)"'
|
||||
NAMESPACE_SCOPE_VAR = "$namespace_scope"
|
||||
NAMESPACE_SELECTOR = f'namespace!="",pod!="",container!="",{NAMESPACE_SCOPE_VAR}'
|
||||
NAMESPACE_GPU_SELECTOR = f'namespace!="",pod!="",{NAMESPACE_SCOPE_VAR}'
|
||||
|
||||
# Per-namespace CPU usage rate and working-set memory. Both use
# NAMESPACE_SELECTOR, so they are narrowed by the dashboard's
# $namespace_scope template variable.
NAMESPACE_CPU_RAW = (
    f'sum(rate(container_cpu_usage_seconds_total{{{NAMESPACE_SELECTOR}}}[5m])) by (namespace)'
)
NAMESPACE_RAM_RAW = f'sum(container_memory_working_set_bytes{{{NAMESPACE_SELECTOR}}}) by (namespace)'
|
||||
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
|
||||
GPU_NODE_REGEX = "|".join(GPU_NODES)
|
||||
NAMESPACE_GPU_ALLOC = (
|
||||
'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
|
||||
' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
|
||||
)
|
||||
NAMESPACE_GPU_USAGE_SHARE = (
|
||||
'sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))'
|
||||
)
|
||||
NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
|
||||
NAMESPACE_GPU_RAW = (
|
||||
"("
|
||||
+ NAMESPACE_GPU_USAGE_SHARE
|
||||
+ ") or on(namespace) ("
|
||||
+ NAMESPACE_CPU_RAW
|
||||
+ " * 0)"
|
||||
)
|
||||
NAMESPACE_GPU_WEIGHT = (
|
||||
"("
|
||||
+ NAMESPACE_GPU_ALLOC
|
||||
+ ") or on(namespace) ("
|
||||
+ NAMESPACE_CPU_RAW
|
||||
+ " * 0)"
|
||||
)
|
||||
NAMESPACE_ACTIVITY_SCORE = (
|
||||
"( "
|
||||
+ NAMESPACE_CPU_RAW
|
||||
+ " ) + ("
|
||||
+ NAMESPACE_RAM_RAW
|
||||
+ " / 1e9) + ("
|
||||
+ NAMESPACE_GPU_WEIGHT
|
||||
+ " * 100)"
|
||||
)
|
||||
NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
|
||||
NAMESPACE_GPU_USAGE_INSTANT = f'sum(DCGM_FI_DEV_GPU_UTIL{{{NAMESPACE_GPU_SELECTOR}}}) by (namespace)'
|
||||
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
||||
TRAEFIK_NET_INGRESS = (
|
||||
'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
|
||||
@ -588,6 +564,44 @@ def pie_panel(panel_id, title, expr, grid):
|
||||
}
|
||||
|
||||
|
||||
def namespace_scope_variable():
    """Build the Grafana ``namespace_scope`` custom template variable.

    Each choice pairs a display label with a PromQL label matcher from the
    NAMESPACE_SCOPE_* constants. The first choice (workload namespaces) is the
    pre-selected default.

    Returns:
        A dict describing a single-value custom variable, including its
        ``query`` string, ``options`` list and ``current`` selection.
    """
    choices = (
        ("workload namespaces only", NAMESPACE_SCOPE_WORKLOAD),
        ("all namespaces", NAMESPACE_SCOPE_ALL),
        ("infrastructure namespaces only", NAMESPACE_SCOPE_INFRA),
    )
    # Only the first entry is marked as selected.
    options = [
        {"text": label, "value": matcher, "selected": position == 0}
        for position, (label, matcher) in enumerate(choices)
    ]
    # Custom-variable query syntax: comma-separated "label : value" pairs.
    query = ",".join(f"{label} : {matcher}" for label, matcher in choices)
    default = options[0]
    return {
        "name": "namespace_scope",
        "label": "Namespace filter",
        "type": "custom",
        "query": query,
        "current": {"text": default["text"], "value": default["value"], "selected": True},
        "options": options,
        "hide": 0,
        "multi": False,
        "includeAll": False,
        "refresh": 1,
        "sort": 0,
        "skipUrlSync": False,
    }
|
||||
|
||||
|
||||
def bargauge_panel(
|
||||
panel_id,
|
||||
title,
|
||||
@ -1063,7 +1077,7 @@ def build_overview():
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": ["atlas", "overview"],
|
||||
"templating": {"list": []},
|
||||
"templating": {"list": [namespace_scope_variable()]},
|
||||
"time": {"from": "now-1h", "to": "now"},
|
||||
"refresh": "1m",
|
||||
"links": [],
|
||||
@ -1757,6 +1771,7 @@ def build_gpu_dashboard():
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": ["atlas", "gpu"],
|
||||
"templating": {"list": [namespace_scope_variable()]},
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -16,10 +16,20 @@ spec:
|
||||
app: ollama
|
||||
annotations:
|
||||
ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
|
||||
ai.bstein.dev/gpu: RTX 3080 8GB (titan-24)
|
||||
ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24)
|
||||
spec:
|
||||
nodeSelector:
|
||||
kubernetes.io/hostname: titan-24
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: kubernetes.io/hostname
|
||||
operator: In
|
||||
values:
|
||||
- titan-20
|
||||
- titan-21
|
||||
- titan-22
|
||||
- titan-24
|
||||
runtimeClassName: nvidia
|
||||
volumes:
|
||||
- name: models
|
||||
@ -55,9 +65,9 @@ spec:
|
||||
requests:
|
||||
cpu: 250m
|
||||
memory: 1Gi
|
||||
nvidia.com/gpu: 1
|
||||
nvidia.com/gpu.shared: 1
|
||||
limits:
|
||||
nvidia.com/gpu: 1
|
||||
nvidia.com/gpu.shared: 1
|
||||
containers:
|
||||
- name: ollama
|
||||
image: ollama/ollama:latest
|
||||
@ -83,8 +93,8 @@ spec:
|
||||
requests:
|
||||
cpu: "2"
|
||||
memory: 8Gi
|
||||
nvidia.com/gpu: 1
|
||||
nvidia.com/gpu.shared: 1
|
||||
limits:
|
||||
cpu: "4"
|
||||
memory: 12Gi
|
||||
nvidia.com/gpu: 1
|
||||
nvidia.com/gpu.shared: 1
|
||||
|
||||
@ -39,7 +39,7 @@ spec:
|
||||
fieldPath: spec.nodeName
|
||||
- name: AI_NODE_GPU_MAP
|
||||
value: |
|
||||
{"titan-24": "RTX 3080 8GB (local GPU)", "titan-22": "RTX 3050 8GB (local GPU)"}
|
||||
{"titan-20": "Jetson Xavier (edge GPU)", "titan-21": "Jetson Xavier (edge GPU)", "titan-22": "RTX 3050 8GB (local GPU)", "titan-24": "RTX 3080 8GB (local GPU)"}
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8080
|
||||
|
||||
@ -68,8 +68,18 @@ spec:
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /config
|
||||
nodeSelector:
|
||||
jellyfin: "true"
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: kubernetes.io/hostname
|
||||
operator: In
|
||||
values:
|
||||
- titan-20
|
||||
- titan-21
|
||||
- titan-22
|
||||
- titan-24
|
||||
securityContext:
|
||||
runAsUser: 1000
|
||||
fsGroup: 65532
|
||||
@ -96,11 +106,11 @@ spec:
|
||||
value: "002"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 1
|
||||
nvidia.com/gpu.shared: 1
|
||||
# cpu: "4"
|
||||
# memory: 8Gi
|
||||
requests:
|
||||
nvidia.com/gpu: 1
|
||||
nvidia.com/gpu.shared: 1
|
||||
cpu: "500m"
|
||||
memory: 1Gi
|
||||
volumeMounts:
|
||||
|
||||
@ -20,7 +20,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
||||
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -71,7 +71,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
|
||||
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -182,5 +182,43 @@
|
||||
"tags": [
|
||||
"atlas",
|
||||
"gpu"
|
||||
]
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace_scope",
|
||||
"label": "Namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
"text": "all namespaces",
|
||||
"value": "namespace=~\".*\"",
|
||||
"selected": false
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
"hide": 0,
|
||||
"multi": false,
|
||||
"includeAll": false,
|
||||
"refresh": 1,
|
||||
"sort": 0,
|
||||
"skipUrlSync": false
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
@ -1086,7 +1086,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
||||
"expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -1137,7 +1137,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
||||
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -1188,7 +1188,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
||||
"expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -1791,7 +1791,42 @@
|
||||
"overview"
|
||||
],
|
||||
"templating": {
|
||||
"list": []
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace_scope",
|
||||
"label": "Namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
"text": "all namespaces",
|
||||
"value": "namespace=~\".*\"",
|
||||
"selected": false
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
"hide": 0,
|
||||
"multi": false,
|
||||
"includeAll": false,
|
||||
"refresh": 1,
|
||||
"sort": 0,
|
||||
"skipUrlSync": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
|
||||
@ -29,7 +29,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
||||
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -80,7 +80,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
|
||||
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -191,5 +191,43 @@ data:
|
||||
"tags": [
|
||||
"atlas",
|
||||
"gpu"
|
||||
]
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace_scope",
|
||||
"label": "Namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
"text": "all namespaces",
|
||||
"value": "namespace=~\".*\"",
|
||||
"selected": false
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
"hide": 0,
|
||||
"multi": false,
|
||||
"includeAll": false,
|
||||
"refresh": 1,
|
||||
"sort": 0,
|
||||
"skipUrlSync": false
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
@ -1095,7 +1095,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
||||
"expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -1146,7 +1146,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
||||
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -1197,7 +1197,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
|
||||
"expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -1800,7 +1800,42 @@ data:
|
||||
"overview"
|
||||
],
|
||||
"templating": {
|
||||
"list": []
|
||||
"list": [
|
||||
{
|
||||
"name": "namespace_scope",
|
||||
"label": "Namespace filter",
|
||||
"type": "custom",
|
||||
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"current": {
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"selected": true
|
||||
},
|
||||
"options": [
|
||||
{
|
||||
"text": "workload namespaces only",
|
||||
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"selected": true
|
||||
},
|
||||
{
|
||||
"text": "all namespaces",
|
||||
"value": "namespace=~\".*\"",
|
||||
"selected": false
|
||||
},
|
||||
{
|
||||
"text": "infrastructure namespaces only",
|
||||
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
|
||||
"selected": false
|
||||
}
|
||||
],
|
||||
"hide": 0,
|
||||
"multi": false,
|
||||
"includeAll": false,
|
||||
"refresh": 1,
|
||||
"sort": 0,
|
||||
"skipUrlSync": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user