monitoring: fix namespace gpu share and network stats

This commit is contained in:
Brad Stein 2025-11-18 11:12:03 -03:00
parent d7e4bcd533
commit 7009a4f9ff
5 changed files with 28 additions and 26 deletions

View File

@ -178,7 +178,9 @@ def namespace_ram_share_expr():
def namespace_gpu_share_expr():
selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
selected = (
f"( {NAMESPACE_GPU_RAW} ) + on(namespace) group_left() (0 * {NAMESPACE_COMBINED_FILTER})"
)
total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
return f"100 * ( {selected} ) / {total}"
@ -250,8 +252,8 @@ NAMESPACE_COMBINED_FILTER = (
+ " * 10))"
)
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
NET_INGRESS_EXPR = 'sum(rate(node_network_receive_bytes_total{device!~"lo"}[5m])) or on() vector(0)'
NET_EGRESS_EXPR = 'sum(rate(node_network_transmit_bytes_total{device!~"lo"}[5m])) or on() vector(0)'
NET_INGRESS_EXPR = 'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
NET_EGRESS_EXPR = 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
# ---------------------------------------------------------------------------
# Panel factories
@ -471,7 +473,7 @@ def build_overview():
thresholds = {
"mode": "absolute",
"steps": [
{"color": "red", "value": 0},
{"color": "red", "value": None},
{"color": "orange", "value": WORKER_TOTAL - 2},
{"color": "yellow", "value": WORKER_TOTAL - 1},
{"color": "green", "value": WORKER_TOTAL},
@ -481,7 +483,7 @@ def build_overview():
thresholds = {
"mode": "absolute",
"steps": [
{"color": "red", "value": 0},
{"color": "red", "value": None},
{"color": "green", "value": CONTROL_TOTAL},
],
}
@ -489,7 +491,7 @@ def build_overview():
thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": 0},
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},

View File

@ -20,7 +20,7 @@
},
"targets": [
{
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A"
}
],
@ -80,7 +80,7 @@
},
"targets": [
{
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A"
}
],

View File

@ -38,7 +38,7 @@
"steps": [
{
"color": "red",
"value": 0
"value": null
},
{
"color": "orange",
@ -107,7 +107,7 @@
"steps": [
{
"color": "red",
"value": 0
"value": null
},
{
"color": "green",
@ -168,7 +168,7 @@
"steps": [
{
"color": "green",
"value": 0
"value": null
},
{
"color": "yellow",
@ -243,7 +243,7 @@
"steps": [
{
"color": "green",
"value": 0
"value": null
},
{
"color": "yellow",
@ -318,7 +318,7 @@
"steps": [
{
"color": "green",
"value": 0
"value": null
},
{
"color": "yellow",
@ -764,7 +764,7 @@
},
"targets": [
{
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1016,7 +1016,7 @@
},
"targets": [
{
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A",
"legendFormat": "Ingress"
}
@ -1060,7 +1060,7 @@
},
"targets": [
{
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A",
"legendFormat": "Egress"
}

View File

@ -29,7 +29,7 @@ data:
},
"targets": [
{
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A"
}
],
@ -89,7 +89,7 @@ data:
},
"targets": [
{
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A"
}
],

View File

@ -47,7 +47,7 @@ data:
"steps": [
{
"color": "red",
"value": 0
"value": null
},
{
"color": "orange",
@ -116,7 +116,7 @@ data:
"steps": [
{
"color": "red",
"value": 0
"value": null
},
{
"color": "green",
@ -177,7 +177,7 @@ data:
"steps": [
{
"color": "green",
"value": 0
"value": null
},
{
"color": "yellow",
@ -252,7 +252,7 @@ data:
"steps": [
{
"color": "green",
"value": 0
"value": null
},
{
"color": "yellow",
@ -327,7 +327,7 @@ data:
"steps": [
{
"color": "green",
"value": 0
"value": null
},
{
"color": "yellow",
@ -773,7 +773,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1025,7 +1025,7 @@ data:
},
"targets": [
{
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A",
"legendFormat": "Ingress"
}
@ -1069,7 +1069,7 @@ data:
},
"targets": [
{
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A",
"legendFormat": "Egress"
}