feature/atlas-monitoring #3
@ -167,17 +167,20 @@ def node_io_expr(scope=""):
|
|||||||
|
|
||||||
def namespace_cpu_share_expr():
|
def namespace_cpu_share_expr():
|
||||||
selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
||||||
return f"100 * ( {selected} ) / sum( {NAMESPACE_CPU_RAW} )"
|
total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)"
|
||||||
|
return f"100 * ( {selected} ) / {total}"
|
||||||
|
|
||||||
|
|
||||||
def namespace_ram_share_expr():
|
def namespace_ram_share_expr():
|
||||||
selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
||||||
return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )"
|
total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)"
|
||||||
|
return f"100 * ( {selected} ) / {total}"
|
||||||
|
|
||||||
|
|
||||||
def namespace_gpu_share_expr():
|
def namespace_gpu_share_expr():
|
||||||
selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
||||||
return f"100 * ( {selected} ) / sum( {NAMESPACE_GPU_RAW} )"
|
total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
|
||||||
|
return f"100 * ( {selected} ) / {total}"
|
||||||
|
|
||||||
|
|
||||||
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
|
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
|
||||||
@ -222,12 +225,17 @@ NAMESPACE_RAM_RAW = (
|
|||||||
NAMESPACE_GPU_RAW = (
|
NAMESPACE_GPU_RAW = (
|
||||||
'sum(rate(container_gpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
|
'sum(rate(container_gpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
|
||||||
)
|
)
|
||||||
|
NAMESPACE_GPU_RAW = (
|
||||||
|
'sum(kube_pod_container_resource_requests{resource="nvidia.com/gpu",namespace!=""}) by (namespace)'
|
||||||
|
)
|
||||||
NAMESPACE_COMBINED_FILTER = (
|
NAMESPACE_COMBINED_FILTER = (
|
||||||
'topk(10, ('
|
'topk(10, ('
|
||||||
+ NAMESPACE_CPU_RAW
|
+ NAMESPACE_CPU_RAW
|
||||||
+ ") + ("
|
+ ") + ("
|
||||||
+ NAMESPACE_RAM_RAW
|
+ NAMESPACE_RAM_RAW
|
||||||
+ ' / 1e9))'
|
+ ' / 1e9) + ('
|
||||||
|
+ NAMESPACE_GPU_RAW
|
||||||
|
+ ' * 10))'
|
||||||
)
|
)
|
||||||
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
||||||
NET_INGRESS_EXPR = (
|
NET_INGRESS_EXPR = (
|
||||||
@ -518,24 +526,24 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
pie_panel(
|
pie_panel(
|
||||||
11,
|
11,
|
||||||
"Namespace CPU share",
|
"Namespace GPU share",
|
||||||
namespace_cpu_share_expr(),
|
namespace_gpu_share_expr(),
|
||||||
{"h": 9, "w": 8, "x": 0, "y": 10},
|
{"h": 9, "w": 8, "x": 0, "y": 10},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
pie_panel(
|
pie_panel(
|
||||||
12,
|
12,
|
||||||
"Namespace RAM share",
|
"Namespace CPU share",
|
||||||
namespace_ram_share_expr(),
|
namespace_cpu_share_expr(),
|
||||||
{"h": 9, "w": 8, "x": 8, "y": 10},
|
{"h": 9, "w": 8, "x": 8, "y": 10},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
pie_panel(
|
pie_panel(
|
||||||
13,
|
13,
|
||||||
"Namespace GPU share",
|
"Namespace RAM share",
|
||||||
namespace_gpu_share_expr(),
|
namespace_ram_share_expr(),
|
||||||
{"h": 9, "w": 8, "x": 16, "y": 10},
|
{"h": 9, "w": 8, "x": 16, "y": 10},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
@ -709,7 +709,7 @@
|
|||||||
{
|
{
|
||||||
"id": 11,
|
"id": 11,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace CPU share",
|
"title": "Namespace GPU share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -722,7 +722,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
|
"expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -751,7 +751,7 @@
|
|||||||
{
|
{
|
||||||
"id": 12,
|
"id": 12,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace RAM share",
|
"title": "Namespace CPU share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -764,7 +764,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )",
|
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -793,7 +793,7 @@
|
|||||||
{
|
{
|
||||||
"id": 13,
|
"id": 13,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace GPU share",
|
"title": "Namespace RAM share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -806,7 +806,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
|
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -718,7 +718,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 11,
|
"id": 11,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace CPU share",
|
"title": "Namespace GPU share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -731,7 +731,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
|
"expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -760,7 +760,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 12,
|
"id": 12,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace RAM share",
|
"title": "Namespace CPU share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -773,7 +773,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )",
|
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -802,7 +802,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 13,
|
"id": 13,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace GPU share",
|
"title": "Namespace RAM share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -815,7 +815,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
|
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user