gpu: enable time-slicing and refresh dashboards
parent 7020d53fd8
commit 6a76fc0fa3

@@ -2,6 +2,7 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
+  - ../components/device-plugin-config
   - ../components/device-plugin-jetson
   - ../components/device-plugin-minipc
   - ../components/device-plugin-tethys
@@ -0,0 +1,15 @@
+# infrastructure/modules/profiles/components/device-plugin-config/configmap.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: nvidia-device-plugin-config
+  namespace: kube-system
+data:
+  config.yaml: |
+    version: v1
+    sharing:
+      timeSlicing:
+        renameByDefault: true
+        resources:
+          - name: nvidia.com/gpu
+            replicas: 4
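
Note: with renameByDefault: true and replicas: 4, the device plugin should stop advertising plain nvidia.com/gpu on these nodes and instead expose four nvidia.com/gpu.shared slots per physical GPU, which is why the workload manifests below switch their requests. A minimal verification sketch (assuming the kubernetes Python client and a kubeconfig for this cluster):

    # Check that GPU nodes now report the time-sliced resource in allocatable.
    from kubernetes import client, config

    config.load_kube_config()
    v1 = client.CoreV1Api()

    for node in v1.list_node().items:
        shared = (node.status.allocatable or {}).get("nvidia.com/gpu.shared")
        if shared:
            # expect 4 slots per physical GPU once the plugin restarts with the new config
            print(f"{node.metadata.name}: nvidia.com/gpu.shared={shared}")
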
@@ -0,0 +1,5 @@
+# infrastructure/modules/profiles/components/device-plugin-config/kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - configmap.yaml

@@ -31,6 +31,7 @@ spec:
           args:
             - "--fail-on-init-error=false"
             - "--device-list-strategy=envvar,cdi"
+            - "--config-file=/config/config.yaml"
           securityContext:
             privileged: true
           env:
@@ -41,7 +42,12 @@ spec:
           volumeMounts:
             - name: device-plugin
               mountPath: /var/lib/kubelet/device-plugins
+            - name: config
+              mountPath: /config
       volumes:
         - name: device-plugin
           hostPath:
             path: /var/lib/kubelet/device-plugins
+        - name: config
+          configMap:
+            name: nvidia-device-plugin-config

@@ -32,6 +32,7 @@ spec:
             - "--fail-on-init-error=false"
             - "--device-list-strategy=envvar"
             - "--mig-strategy=none"
+            - "--config-file=/config/config.yaml"
           securityContext:
             privileged: true
           env:
@@ -42,7 +43,12 @@ spec:
           volumeMounts:
             - name: device-plugin
               mountPath: /var/lib/kubelet/device-plugins
+            - name: config
+              mountPath: /config
       volumes:
         - name: device-plugin
           hostPath:
             path: /var/lib/kubelet/device-plugins
+        - name: config
+          configMap:
+            name: nvidia-device-plugin-config

@@ -33,6 +33,7 @@ spec:
             - "--fail-on-init-error=false"
             - "--device-list-strategy=envvar"
             - "--mig-strategy=none"
+            - "--config-file=/config/config.yaml"
           securityContext:
             privileged: true
           env:
@@ -43,7 +44,12 @@ spec:
           volumeMounts:
             - name: device-plugin
               mountPath: /var/lib/kubelet/device-plugins
+            - name: config
+              mountPath: /config
       volumes:
         - name: device-plugin
           hostPath:
             path: /var/lib/kubelet/device-plugins
+        - name: config
+          configMap:
+            name: nvidia-device-plugin-config

@@ -2,4 +2,5 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
+  - ../components/device-plugin-config
   - ../components/device-plugin-tethys

@@ -171,9 +171,8 @@ def node_io_expr(scope=""):
 
 
 def namespace_share_expr(resource_expr):
-    selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )"
-    total = f"clamp_min(sum( {selected} ), 1)"
-    return f"100 * ( {selected} ) / {total}"
+    total = f"clamp_min(sum( {resource_expr} ), 1)"
+    return f"100 * ( {resource_expr} ) / {total}"
 
 
 def namespace_cpu_share_expr():
@@ -185,7 +184,10 @@ def namespace_ram_share_expr():
 
 
 def namespace_gpu_share_expr():
-    return namespace_share_expr(NAMESPACE_GPU_RAW)
+    total = f"(sum({NAMESPACE_GPU_USAGE_INSTANT}) or on() vector(0))"
+    share = f"100 * ({NAMESPACE_GPU_USAGE_INSTANT}) / clamp_min({total}, 1)"
+    idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
+    return f"({share}) or {idle}"
 
 
 PROBLEM_PODS_EXPR = (
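
Note: namespace_gpu_share_expr() now splits instantaneous DCGM_FI_DEV_GPU_UTIL by namespace and, when the summed utilization is zero, falls back to a synthetic "idle" series pinned at 100% so the pie panel never renders empty. A small sketch of the PromQL string it composes (the constant is inlined here for illustration; in the module it is built from the selectors added in the next hunk):

    # Reproduce the composed expression outside the generator.
    NAMESPACE_GPU_USAGE_INSTANT = (
        'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!="",$namespace_scope}) by (namespace)'
    )
    total = f"(sum({NAMESPACE_GPU_USAGE_INSTANT}) or on() vector(0))"
    share = f"100 * ({NAMESPACE_GPU_USAGE_INSTANT}) / clamp_min({total}, 1)"
    idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
    print(f"({share}) or {idle}")
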
@@ -270,46 +272,20 @@ STUCK_TABLE_EXPR = (
     ")"
 )
 
+NAMESPACE_SCOPE_WORKLOAD = 'namespace!~"(^kube.*|.*-system$|^traefik$)"'
+NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
+NAMESPACE_SCOPE_INFRA = 'namespace=~"(^kube.*|.*-system$|^traefik$)"'
+NAMESPACE_SCOPE_VAR = "$namespace_scope"
+NAMESPACE_SELECTOR = f'namespace!="",pod!="",container!="",{NAMESPACE_SCOPE_VAR}'
+NAMESPACE_GPU_SELECTOR = f'namespace!="",pod!="",{NAMESPACE_SCOPE_VAR}'
+
 NAMESPACE_CPU_RAW = (
-    'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
-)
-NAMESPACE_RAM_RAW = (
-    'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
+    f'sum(rate(container_cpu_usage_seconds_total{{{NAMESPACE_SELECTOR}}}[5m])) by (namespace)'
 )
+NAMESPACE_RAM_RAW = f'sum(container_memory_working_set_bytes{{{NAMESPACE_SELECTOR}}}) by (namespace)'
 GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
 GPU_NODE_REGEX = "|".join(GPU_NODES)
-NAMESPACE_GPU_ALLOC = (
-    'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
-    ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
-)
-NAMESPACE_GPU_USAGE_SHARE = (
-    'sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))'
-)
-NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
-NAMESPACE_GPU_RAW = (
-    "("
-    + NAMESPACE_GPU_USAGE_SHARE
-    + ") or on(namespace) ("
-    + NAMESPACE_CPU_RAW
-    + " * 0)"
-)
-NAMESPACE_GPU_WEIGHT = (
-    "("
-    + NAMESPACE_GPU_ALLOC
-    + ") or on(namespace) ("
-    + NAMESPACE_CPU_RAW
-    + " * 0)"
-)
-NAMESPACE_ACTIVITY_SCORE = (
-    "( "
-    + NAMESPACE_CPU_RAW
-    + " ) + ("
-    + NAMESPACE_RAM_RAW
-    + " / 1e9) + ("
-    + NAMESPACE_GPU_WEIGHT
-    + " * 100)"
-)
-NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
+NAMESPACE_GPU_USAGE_INSTANT = f'sum(DCGM_FI_DEV_GPU_UTIL{{{NAMESPACE_GPU_SELECTOR}}}) by (namespace)'
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
 TRAEFIK_NET_INGRESS = (
     'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
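
Note: every selector now carries the $namespace_scope Grafana variable, so the same generated query serves all three scope options; Grafana substitutes the chosen label matcher before the query reaches Prometheus. A small sketch of the expansion for the default "workload namespaces only" option (values copied from the constants above):

    # Illustrate what Prometheus actually receives after variable substitution.
    NAMESPACE_SCOPE_WORKLOAD = 'namespace!~"(^kube.*|.*-system$|^traefik$)"'
    NAMESPACE_SELECTOR = 'namespace!="",pod!="",container!="",$namespace_scope'

    expanded = NAMESPACE_SELECTOR.replace("$namespace_scope", NAMESPACE_SCOPE_WORKLOAD)
    print(f"sum(rate(container_cpu_usage_seconds_total{{{expanded}}}[5m])) by (namespace)")
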
@@ -588,6 +564,44 @@ def pie_panel(panel_id, title, expr, grid):
     }
 
 
+def namespace_scope_variable():
+    options = [
+        {
+            "text": "workload namespaces only",
+            "value": NAMESPACE_SCOPE_WORKLOAD,
+            "selected": True,
+        },
+        {"text": "all namespaces", "value": NAMESPACE_SCOPE_ALL, "selected": False},
+        {
+            "text": "infrastructure namespaces only",
+            "value": NAMESPACE_SCOPE_INFRA,
+            "selected": False,
+        },
+    ]
+    query = (
+        "workload namespaces only : "
+        + NAMESPACE_SCOPE_WORKLOAD
+        + ",all namespaces : "
+        + NAMESPACE_SCOPE_ALL
+        + ",infrastructure namespaces only : "
+        + NAMESPACE_SCOPE_INFRA
+    )
+    return {
+        "name": "namespace_scope",
+        "label": "Namespace filter",
+        "type": "custom",
+        "query": query,
+        "current": {"text": options[0]["text"], "value": options[0]["value"], "selected": True},
+        "options": options,
+        "hide": 0,
+        "multi": False,
+        "includeAll": False,
+        "refresh": 1,
+        "sort": 0,
+        "skipUrlSync": False,
+    }
+
+
 def bargauge_panel(
     panel_id,
     title,
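
Note: this is a Grafana "custom" template variable, so the query string follows the comma-separated "text : value" convention; the dropdown shows the friendly text while the raw label matcher is what gets injected wherever $namespace_scope appears. A quick sketch that splits the query back into its options:

    # Show which matcher each dropdown entry injects into $namespace_scope.
    query = (
        'workload namespaces only : namespace!~"(^kube.*|.*-system$|^traefik$)"'
        ',all namespaces : namespace=~".*"'
        ',infrastructure namespaces only : namespace=~"(^kube.*|.*-system$|^traefik$)"'
    )
    for entry in query.split(","):
        text, value = entry.split(" : ", 1)
        print(f"{text} -> {value}")
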
@@ -1063,7 +1077,7 @@ def build_overview():
         "schemaVersion": 39,
         "style": "dark",
         "tags": ["atlas", "overview"],
-        "templating": {"list": []},
+        "templating": {"list": [namespace_scope_variable()]},
         "time": {"from": "now-1h", "to": "now"},
         "refresh": "1m",
         "links": [],
@@ -1757,6 +1771,7 @@ def build_gpu_dashboard():
         "schemaVersion": 39,
         "style": "dark",
         "tags": ["atlas", "gpu"],
+        "templating": {"list": [namespace_scope_variable()]},
     }
 
 

@@ -16,10 +16,20 @@ spec:
         app: ollama
       annotations:
         ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
-        ai.bstein.dev/gpu: RTX 3080 8GB (titan-24)
+        ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24)
     spec:
-      nodeSelector:
-        kubernetes.io/hostname: titan-24
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values:
+                      - titan-20
+                      - titan-21
+                      - titan-22
+                      - titan-24
       runtimeClassName: nvidia
       volumes:
         - name: models
@@ -55,9 +65,9 @@ spec:
           requests:
             cpu: 250m
             memory: 1Gi
-            nvidia.com/gpu: 1
+            nvidia.com/gpu.shared: 1
           limits:
-            nvidia.com/gpu: 1
+            nvidia.com/gpu.shared: 1
       containers:
         - name: ollama
           image: ollama/ollama:latest
@@ -83,8 +93,8 @@ spec:
           requests:
             cpu: "2"
             memory: 8Gi
-            nvidia.com/gpu: 1
+            nvidia.com/gpu.shared: 1
           limits:
             cpu: "4"
             memory: 12Gi
-            nvidia.com/gpu: 1
+            nvidia.com/gpu.shared: 1

@@ -39,7 +39,7 @@ spec:
                   fieldPath: spec.nodeName
             - name: AI_NODE_GPU_MAP
               value: |
-                {"titan-24": "RTX 3080 8GB (local GPU)", "titan-22": "RTX 3050 8GB (local GPU)"}
+                {"titan-20": "Jetson Xavier (edge GPU)", "titan-21": "Jetson Xavier (edge GPU)", "titan-22": "RTX 3050 8GB (local GPU)", "titan-24": "RTX 3080 8GB (local GPU)"}
           ports:
             - name: http
               containerPort: 8080

@@ -68,8 +68,18 @@ spec:
           volumeMounts:
             - name: config
               mountPath: /config
-      nodeSelector:
-        jellyfin: "true"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values:
+                      - titan-20
+                      - titan-21
+                      - titan-22
+                      - titan-24
       securityContext:
         runAsUser: 1000
         fsGroup: 65532
@@ -96,11 +106,11 @@ spec:
               value: "002"
           resources:
             limits:
-              nvidia.com/gpu: 1
+              nvidia.com/gpu.shared: 1
               # cpu: "4"
               # memory: 8Gi
             requests:
-              nvidia.com/gpu: 1
+              nvidia.com/gpu.shared: 1
               cpu: "500m"
               memory: 1Gi
           volumeMounts:
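
Note: because the plugin config renames the resource, these workloads must ask for nvidia.com/gpu.shared; anything left requesting plain nvidia.com/gpu would no longer find that resource on the time-sliced nodes and could sit Pending. A quick audit sketch (assuming the kubernetes Python client and a working kubeconfig):

    # List containers that still request the old, un-renamed GPU resource.
    from kubernetes import client, config

    config.load_kube_config()
    v1 = client.CoreV1Api()

    for pod in v1.list_pod_for_all_namespaces().items:
        for c in pod.spec.containers:
            requests = (c.resources.requests or {}) if c.resources else {}
            if "nvidia.com/gpu" in requests:
                print(f"{pod.metadata.namespace}/{pod.metadata.name} ({c.name}) still requests nvidia.com/gpu")
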

@@ -20,7 +20,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -71,7 +71,7 @@
       },
       "targets": [
         {
-          "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
+          "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -182,5 +182,43 @@
   "tags": [
     "atlas",
     "gpu"
-  ]
+  ],
+  "templating": {
+    "list": [
+      {
+        "name": "namespace_scope",
+        "label": "Namespace filter",
+        "type": "custom",
+        "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+        "current": {
+          "text": "workload namespaces only",
+          "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+          "selected": true
+        },
+        "options": [
+          {
+            "text": "workload namespaces only",
+            "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+            "selected": true
+          },
+          {
+            "text": "all namespaces",
+            "value": "namespace=~\".*\"",
+            "selected": false
+          },
+          {
+            "text": "infrastructure namespaces only",
+            "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+            "selected": false
+          }
+        ],
+        "hide": 0,
+        "multi": false,
+        "includeAll": false,
+        "refresh": 1,
+        "sort": 0,
+        "skipUrlSync": false
+      }
+    ]
+  }
 }

@@ -1086,7 +1086,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -1137,7 +1137,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -1188,7 +1188,7 @@
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -1791,7 +1791,42 @@
     "overview"
   ],
   "templating": {
-    "list": []
+    "list": [
+      {
+        "name": "namespace_scope",
+        "label": "Namespace filter",
+        "type": "custom",
+        "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+        "current": {
+          "text": "workload namespaces only",
+          "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+          "selected": true
+        },
+        "options": [
+          {
+            "text": "workload namespaces only",
+            "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+            "selected": true
+          },
+          {
+            "text": "all namespaces",
+            "value": "namespace=~\".*\"",
+            "selected": false
+          },
+          {
+            "text": "infrastructure namespaces only",
+            "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+            "selected": false
+          }
+        ],
+        "hide": 0,
+        "multi": false,
+        "includeAll": false,
+        "refresh": 1,
+        "sort": 0,
+        "skipUrlSync": false
+      }
+    ]
   },
   "time": {
     "from": "now-1h",

@@ -29,7 +29,7 @@ data:
       },
       "targets": [
         {
-          "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -80,7 +80,7 @@ data:
       },
       "targets": [
         {
-          "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)",
+          "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -191,5 +191,43 @@ data:
   "tags": [
     "atlas",
     "gpu"
-  ]
+  ],
+  "templating": {
+    "list": [
+      {
+        "name": "namespace_scope",
+        "label": "Namespace filter",
+        "type": "custom",
+        "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+        "current": {
+          "text": "workload namespaces only",
+          "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+          "selected": true
+        },
+        "options": [
+          {
+            "text": "workload namespaces only",
+            "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+            "selected": true
+          },
+          {
+            "text": "all namespaces",
+            "value": "namespace=~\".*\"",
+            "selected": false
+          },
+          {
+            "text": "infrastructure namespaces only",
+            "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+            "selected": false
+          }
+        ],
+        "hide": 0,
+        "multi": false,
+        "includeAll": false,
+        "refresh": 1,
+        "sort": 0,
+        "skipUrlSync": false
+      }
+    ]
+  }
 }

@@ -1095,7 +1095,7 @@ data:
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -1146,7 +1146,7 @@ data:
       },
       "targets": [
         {
-          "expr": "100 * ( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -1197,7 +1197,7 @@ data:
       },
       "targets": [
         {
-          "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
+          "expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ), 1)",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -1800,7 +1800,42 @@ data:
     "overview"
   ],
   "templating": {
-    "list": []
+    "list": [
+      {
+        "name": "namespace_scope",
+        "label": "Namespace filter",
+        "type": "custom",
+        "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+        "current": {
+          "text": "workload namespaces only",
+          "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+          "selected": true
+        },
+        "options": [
+          {
+            "text": "workload namespaces only",
+            "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
+            "selected": true
+          },
+          {
+            "text": "all namespaces",
+            "value": "namespace=~\".*\"",
+            "selected": false
+          },
+          {
+            "text": "infrastructure namespaces only",
+            "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
+            "selected": false
+          }
+        ],
+        "hide": 0,
+        "multi": false,
+        "includeAll": false,
+        "refresh": 1,
+        "sort": 0,
+        "skipUrlSync": false
+      }
+    ]
   },
   "time": {
     "from": "now-1h",