feature/atlas-monitoring #3
@ -178,7 +178,9 @@ def namespace_ram_share_expr():
|
|||||||
|
|
||||||
|
|
||||||
def namespace_gpu_share_expr():
|
def namespace_gpu_share_expr():
|
||||||
selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
selected = (
|
||||||
|
f"( {NAMESPACE_GPU_RAW} ) + on(namespace) group_left() (0 * {NAMESPACE_COMBINED_FILTER})"
|
||||||
|
)
|
||||||
total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
|
total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
|
||||||
return f"100 * ( {selected} ) / {total}"
|
return f"100 * ( {selected} ) / {total}"
|
||||||
|
|
||||||
@ -250,8 +252,8 @@ NAMESPACE_COMBINED_FILTER = (
|
|||||||
+ " * 10))"
|
+ " * 10))"
|
||||||
)
|
)
|
||||||
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
||||||
NET_INGRESS_EXPR = 'sum(rate(node_network_receive_bytes_total{device!~"lo"}[5m])) or on() vector(0)'
|
NET_INGRESS_EXPR = 'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
|
||||||
NET_EGRESS_EXPR = 'sum(rate(node_network_transmit_bytes_total{device!~"lo"}[5m])) or on() vector(0)'
|
NET_EGRESS_EXPR = 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Panel factories
|
# Panel factories
|
||||||
@ -471,7 +473,7 @@ def build_overview():
|
|||||||
thresholds = {
|
thresholds = {
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
{"color": "red", "value": 0},
|
{"color": "red", "value": None},
|
||||||
{"color": "orange", "value": WORKER_TOTAL - 2},
|
{"color": "orange", "value": WORKER_TOTAL - 2},
|
||||||
{"color": "yellow", "value": WORKER_TOTAL - 1},
|
{"color": "yellow", "value": WORKER_TOTAL - 1},
|
||||||
{"color": "green", "value": WORKER_TOTAL},
|
{"color": "green", "value": WORKER_TOTAL},
|
||||||
@ -481,7 +483,7 @@ def build_overview():
|
|||||||
thresholds = {
|
thresholds = {
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
{"color": "red", "value": 0},
|
{"color": "red", "value": None},
|
||||||
{"color": "green", "value": CONTROL_TOTAL},
|
{"color": "green", "value": CONTROL_TOTAL},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
@ -489,7 +491,7 @@ def build_overview():
|
|||||||
thresholds = {
|
thresholds = {
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
{"color": "green", "value": 0},
|
{"color": "green", "value": None},
|
||||||
{"color": "yellow", "value": 1},
|
{"color": "yellow", "value": 1},
|
||||||
{"color": "orange", "value": 2},
|
{"color": "orange", "value": 2},
|
||||||
{"color": "red", "value": 3},
|
{"color": "red", "value": 3},
|
||||||
|
|||||||
@ -20,7 +20,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -80,7 +80,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@ -38,7 +38,7 @@
|
|||||||
"steps": [
|
"steps": [
|
||||||
{
|
{
|
||||||
"color": "red",
|
"color": "red",
|
||||||
"value": 0
|
"value": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "orange",
|
"color": "orange",
|
||||||
@ -107,7 +107,7 @@
|
|||||||
"steps": [
|
"steps": [
|
||||||
{
|
{
|
||||||
"color": "red",
|
"color": "red",
|
||||||
"value": 0
|
"value": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "green",
|
"color": "green",
|
||||||
@ -168,7 +168,7 @@
|
|||||||
"steps": [
|
"steps": [
|
||||||
{
|
{
|
||||||
"color": "green",
|
"color": "green",
|
||||||
"value": 0
|
"value": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "yellow",
|
"color": "yellow",
|
||||||
@ -243,7 +243,7 @@
|
|||||||
"steps": [
|
"steps": [
|
||||||
{
|
{
|
||||||
"color": "green",
|
"color": "green",
|
||||||
"value": 0
|
"value": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "yellow",
|
"color": "yellow",
|
||||||
@ -318,7 +318,7 @@
|
|||||||
"steps": [
|
"steps": [
|
||||||
{
|
{
|
||||||
"color": "green",
|
"color": "green",
|
||||||
"value": 0
|
"value": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "yellow",
|
"color": "yellow",
|
||||||
@ -764,7 +764,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
|
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -1016,7 +1016,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "Ingress"
|
"legendFormat": "Ingress"
|
||||||
}
|
}
|
||||||
@ -1060,7 +1060,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "Egress"
|
"legendFormat": "Egress"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -29,7 +29,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -89,7 +89,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@ -47,7 +47,7 @@ data:
|
|||||||
"steps": [
|
"steps": [
|
||||||
{
|
{
|
||||||
"color": "red",
|
"color": "red",
|
||||||
"value": 0
|
"value": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "orange",
|
"color": "orange",
|
||||||
@ -116,7 +116,7 @@ data:
|
|||||||
"steps": [
|
"steps": [
|
||||||
{
|
{
|
||||||
"color": "red",
|
"color": "red",
|
||||||
"value": 0
|
"value": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "green",
|
"color": "green",
|
||||||
@ -177,7 +177,7 @@ data:
|
|||||||
"steps": [
|
"steps": [
|
||||||
{
|
{
|
||||||
"color": "green",
|
"color": "green",
|
||||||
"value": 0
|
"value": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "yellow",
|
"color": "yellow",
|
||||||
@ -252,7 +252,7 @@ data:
|
|||||||
"steps": [
|
"steps": [
|
||||||
{
|
{
|
||||||
"color": "green",
|
"color": "green",
|
||||||
"value": 0
|
"value": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "yellow",
|
"color": "yellow",
|
||||||
@ -327,7 +327,7 @@ data:
|
|||||||
"steps": [
|
"steps": [
|
||||||
{
|
{
|
||||||
"color": "green",
|
"color": "green",
|
||||||
"value": 0
|
"value": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "yellow",
|
"color": "yellow",
|
||||||
@ -773,7 +773,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
|
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -1025,7 +1025,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "Ingress"
|
"legendFormat": "Ingress"
|
||||||
}
|
}
|
||||||
@ -1069,7 +1069,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)",
|
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "Egress"
|
"legendFormat": "Egress"
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user