monitoring: control-plane stat and namespace share tweaks

This commit is contained in:
Brad Stein 2025-11-18 17:09:13 -03:00
parent f06be37f44
commit e4f93e85d2
5 changed files with 70 additions and 40 deletions

View File

@ -82,6 +82,9 @@ WORKER_SUFFIX = f"/{WORKER_TOTAL}"
CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system"
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
GAUGE_WIDTHS = [5, 5, 5, 5, 4]
CONTROL_WORKLOADS_EXPR = (
f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
)
# ---------------------------------------------------------------------------
# PromQL helpers
@ -168,7 +171,7 @@ def node_io_expr(scope=""):
def namespace_share_expr(resource_expr):
selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )"
total = f"clamp_min(sum( {resource_expr} ), 1)"
total = f"clamp_min(sum( {selected} ), 1)"
return f"100 * ( {selected} ) / {total}"
@ -282,8 +285,8 @@ NET_TOTAL_EXPR = NET_NODE_TX_PHYS
NET_INGRESS_EXPR = NET_NODE_RX_PHYS
NET_EGRESS_EXPR = NET_NODE_TX_PHYS
NET_INTERNAL_EXPR = (
'sum(rate(container_network_receive_bytes_total{namespace!="traefik",pod!="",container!=""}[5m]) '
'+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!="",container!=""}[5m]))'
'sum(rate(container_network_receive_bytes_total{namespace!="traefik",pod!=""}[5m]) '
'+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))'
' or on() vector(0)'
)
@ -529,9 +532,9 @@ def build_overview():
(
3,
"Control plane workloads",
f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
CONTROL_WORKLOADS_EXPR,
None,
1,
4,
link_to("atlas-pods"),
),
(
@ -600,18 +603,31 @@ def build_overview():
],
}
width, x = gauge_grid(idx)
panels.append(
gauge_panel(
panel_id,
title,
expr,
{"h": 5, "w": width, "x": x, "y": 0},
min_value=min_value,
max_value=max_value,
thresholds=thresholds,
links=links,
if panel_id == 3:
panels.append(
stat_panel(
panel_id,
title,
expr,
{"h": 5, "w": width, "x": x, "y": 0},
thresholds=thresholds,
legend=None,
links=links,
)
)
else:
panels.append(
gauge_panel(
panel_id,
title,
expr,
{"h": 5, "w": width, "x": x, "y": 0},
min_value=min_value,
max_value=max_value,
thresholds=thresholds,
links=links,
)
)
)
hottest = [
(7, "Hottest node: CPU", topk_with_node(node_cpu_expr()), "percent"),

View File

@ -140,7 +140,7 @@
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"refId": "A"
}
],

View File

@ -123,7 +123,7 @@
},
{
"id": 3,
"type": "gauge",
"type": "stat",
"title": "Control plane workloads",
"datasource": {
"type": "prometheus",
@ -137,14 +137,16 @@
},
"targets": [
{
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"}) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 4,
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
@ -165,11 +167,18 @@
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
@ -177,9 +186,7 @@
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
"textMode": "value"
},
"links": [
{
@ -901,7 +908,7 @@
},
"targets": [
{
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -954,7 +961,7 @@
},
"targets": [
{
"expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ), 1)",
"expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1007,7 +1014,7 @@
},
"targets": [
{
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1316,7 +1323,7 @@
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"refId": "A",
"legendFormat": "Internal traffic"
}

View File

@ -149,7 +149,7 @@ data:
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"refId": "A"
}
],

View File

@ -132,7 +132,7 @@ data:
},
{
"id": 3,
"type": "gauge",
"type": "stat",
"title": "Control plane workloads",
"datasource": {
"type": "prometheus",
@ -146,14 +146,16 @@ data:
},
"targets": [
{
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"}) or on() vector(0)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"min": 0,
"max": 4,
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
@ -174,11 +176,18 @@ data:
"value": 3
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
@ -186,9 +195,7 @@ data:
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdMarkers": false,
"showThresholdLabels": false
"textMode": "value"
},
"links": [
{
@ -910,7 +917,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -963,7 +970,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ), 1)",
"expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1016,7 +1023,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1325,7 +1332,7 @@ data:
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)",
"refId": "A",
"legendFormat": "Internal traffic"
}