monitoring: resolve pie errors and network data

This commit is contained in:
Brad Stein 2025-11-18 11:30:33 -03:00
parent 7009a4f9ff
commit fab5552039
5 changed files with 27 additions and 22 deletions

View File

@ -178,9 +178,7 @@ def namespace_ram_share_expr():
def namespace_gpu_share_expr():
selected = (
f"( {NAMESPACE_GPU_RAW} ) + on(namespace) group_left() (0 * {NAMESPACE_COMBINED_FILTER})"
)
selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
return f"100 * ( {selected} ) / {total}"
@ -231,13 +229,14 @@ NAMESPACE_GPU_REQUEST = (
' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
)
NAMESPACE_GPU_FALLBACK = (
'sum by (namespace) (kube_pod_info{namespace!=""}'
f' and on(node) kube_node_info{{node=~"{GPU_NODE_REGEX}"}})'
'sum by (namespace) (kube_pod_info{namespace!="",node=~"'
+ GPU_NODE_REGEX
+ '"})'
)
NAMESPACE_GPU_RAW = (
"("
+ NAMESPACE_GPU_REQUEST
+ ") or on(namespace) group_left() ("
+ ") or on(namespace) ("
+ NAMESPACE_GPU_FALLBACK
+ ")"
)
@ -252,8 +251,14 @@ NAMESPACE_COMBINED_FILTER = (
+ " * 10))"
)
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
NET_INGRESS_EXPR = 'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
NET_EGRESS_EXPR = 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
NET_INGRESS_EXPR = (
'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
" or on() vector(0)"
)
NET_EGRESS_EXPR = (
'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
" or on() vector(0)"
)
# ---------------------------------------------------------------------------
# Panel factories

View File

@ -20,7 +20,7 @@
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
@ -80,7 +80,7 @@
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"refId": "A"
}
],

View File

@ -722,7 +722,7 @@
},
"targets": [
{
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -764,7 +764,7 @@
},
"targets": [
{
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -806,7 +806,7 @@
},
"targets": [
{
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1016,7 +1016,7 @@
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"refId": "A",
"legendFormat": "Ingress"
}
@ -1060,7 +1060,7 @@
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"refId": "A",
"legendFormat": "Egress"
}

View File

@ -29,7 +29,7 @@ data:
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"refId": "A"
}
],
@ -89,7 +89,7 @@ data:
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"refId": "A"
}
],

View File

@ -731,7 +731,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -773,7 +773,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -815,7 +815,7 @@ data:
},
"targets": [
{
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1025,7 +1025,7 @@ data:
},
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"refId": "A",
"legendFormat": "Ingress"
}
@ -1069,7 +1069,7 @@ data:
},
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)",
"refId": "A",
"legendFormat": "Egress"
}