monitoring: fix namespace gpu share and network stats
This commit is contained in:
parent
d7e4bcd533
commit
7009a4f9ff
@ -178,7 +178,9 @@ def namespace_ram_share_expr():
|
||||
|
||||
|
||||
def namespace_gpu_share_expr():
|
||||
selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
||||
selected = (
|
||||
f"( {NAMESPACE_GPU_RAW} ) + on(namespace) group_left() (0 * {NAMESPACE_COMBINED_FILTER})"
|
||||
)
|
||||
total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
|
||||
return f"100 * ( {selected} ) / {total}"
|
||||
|
||||
@ -250,8 +252,8 @@ NAMESPACE_COMBINED_FILTER = (
|
||||
+ " * 10))"
|
||||
)
|
||||
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
||||
NET_INGRESS_EXPR = 'sum(rate(node_network_receive_bytes_total{device!~"lo"}[5m])) or on() vector(0)'
|
||||
NET_EGRESS_EXPR = 'sum(rate(node_network_transmit_bytes_total{device!~"lo"}[5m])) or on() vector(0)'
|
||||
NET_INGRESS_EXPR = 'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
|
||||
NET_EGRESS_EXPR = 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Panel factories
|
||||
@ -471,7 +473,7 @@ def build_overview():
|
||||
thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "red", "value": 0},
|
||||
{"color": "red", "value": None},
|
||||
{"color": "orange", "value": WORKER_TOTAL - 2},
|
||||
{"color": "yellow", "value": WORKER_TOTAL - 1},
|
||||
{"color": "green", "value": WORKER_TOTAL},
|
||||
@ -481,7 +483,7 @@ def build_overview():
|
||||
thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "red", "value": 0},
|
||||
{"color": "red", "value": None},
|
||||
{"color": "green", "value": CONTROL_TOTAL},
|
||||
],
|
||||
}
|
||||
@ -489,7 +491,7 @@ def build_overview():
|
||||
thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": 0},
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 1},
|
||||
{"color": "orange", "value": 2},
|
||||
{"color": "red", "value": 3},
|
||||
|
||||
@ -20,7 +20,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -80,7 +80,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
|
||||
@ -38,7 +38,7 @@
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
@ -107,7 +107,7 @@
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
@ -168,7 +168,7 @@
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
@ -243,7 +243,7 @@
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
@ -318,7 +318,7 @@
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
@ -764,7 +764,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
|
||||
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -1016,7 +1016,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Ingress"
|
||||
}
|
||||
@ -1060,7 +1060,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Egress"
|
||||
}
|
||||
|
||||
@ -29,7 +29,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -89,7 +89,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
|
||||
@ -47,7 +47,7 @@ data:
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
@ -116,7 +116,7 @@ data:
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
@ -177,7 +177,7 @@ data:
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
@ -252,7 +252,7 @@ data:
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
@ -327,7 +327,7 @@ data:
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": 0
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
@ -773,7 +773,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
|
||||
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -1025,7 +1025,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Ingress"
|
||||
}
|
||||
@ -1069,7 +1069,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||
"refId": "A",
|
||||
"legendFormat": "Egress"
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user