Revert GPU pie chart additions
This commit is contained in:
parent
aef3176c1c
commit
beb3243839
@ -145,7 +145,7 @@ def astreae_free_expr(mount):
|
|||||||
|
|
||||||
|
|
||||||
def topk_with_node(expr):
|
def topk_with_node(expr):
|
||||||
return f"topk(1, {expr})"
|
return f'label_replace(topk(1, {expr}), "__name__", "$1", "node", "(.*)")'
|
||||||
|
|
||||||
|
|
||||||
def node_net_expr(scope=""):
|
def node_net_expr(scope=""):
|
||||||
@ -167,20 +167,12 @@ def node_io_expr(scope=""):
|
|||||||
|
|
||||||
def namespace_cpu_share_expr():
|
def namespace_cpu_share_expr():
|
||||||
selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
||||||
total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)"
|
return f"100 * ( {selected} ) / sum( {NAMESPACE_CPU_RAW} )"
|
||||||
return f"100 * ( {selected} ) / {total}"
|
|
||||||
|
|
||||||
|
|
||||||
def namespace_ram_share_expr():
|
def namespace_ram_share_expr():
|
||||||
selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
||||||
total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)"
|
return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )"
|
||||||
return f"100 * ( {selected} ) / {total}"
|
|
||||||
|
|
||||||
|
|
||||||
def namespace_gpu_share_expr():
|
|
||||||
selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
|
|
||||||
total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
|
|
||||||
return f"100 * ( {selected} ) / {total}"
|
|
||||||
|
|
||||||
|
|
||||||
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
|
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
|
||||||
@ -222,20 +214,12 @@ NAMESPACE_CPU_RAW = (
|
|||||||
NAMESPACE_RAM_RAW = (
|
NAMESPACE_RAM_RAW = (
|
||||||
'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
|
'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
|
||||||
)
|
)
|
||||||
NAMESPACE_GPU_RAW = (
|
|
||||||
'sum(rate(container_gpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
|
|
||||||
)
|
|
||||||
NAMESPACE_GPU_RAW = (
|
|
||||||
'sum(kube_pod_container_resource_requests{resource="nvidia.com/gpu",namespace!=""}) by (namespace)'
|
|
||||||
)
|
|
||||||
NAMESPACE_COMBINED_FILTER = (
|
NAMESPACE_COMBINED_FILTER = (
|
||||||
'topk(10, ('
|
'topk(10, ('
|
||||||
+ NAMESPACE_CPU_RAW
|
+ NAMESPACE_CPU_RAW
|
||||||
+ ") + ("
|
+ ") + ("
|
||||||
+ NAMESPACE_RAM_RAW
|
+ NAMESPACE_RAM_RAW
|
||||||
+ ' / 1e9) + ('
|
+ ' / 1e9))'
|
||||||
+ NAMESPACE_GPU_RAW
|
|
||||||
+ ' * 10))'
|
|
||||||
)
|
)
|
||||||
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
||||||
NET_INGRESS_EXPR = (
|
NET_INGRESS_EXPR = (
|
||||||
@ -526,32 +510,24 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
pie_panel(
|
pie_panel(
|
||||||
11,
|
11,
|
||||||
"Namespace GPU share",
|
"Namespace CPU share",
|
||||||
namespace_gpu_share_expr(),
|
namespace_cpu_share_expr(),
|
||||||
{"h": 9, "w": 8, "x": 0, "y": 10},
|
{"h": 9, "w": 12, "x": 0, "y": 10},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
pie_panel(
|
pie_panel(
|
||||||
12,
|
12,
|
||||||
"Namespace CPU share",
|
|
||||||
namespace_cpu_share_expr(),
|
|
||||||
{"h": 9, "w": 8, "x": 8, "y": 10},
|
|
||||||
)
|
|
||||||
)
|
|
||||||
panels.append(
|
|
||||||
pie_panel(
|
|
||||||
13,
|
|
||||||
"Namespace RAM share",
|
"Namespace RAM share",
|
||||||
namespace_ram_share_expr(),
|
namespace_ram_share_expr(),
|
||||||
{"h": 9, "w": 8, "x": 16, "y": 10},
|
{"h": 9, "w": 12, "x": 12, "y": 10},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
worker_filter = f"{WORKER_REGEX}"
|
worker_filter = f"{WORKER_REGEX}"
|
||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
14,
|
13,
|
||||||
"Worker node CPU",
|
"Worker node CPU",
|
||||||
node_cpu_expr(worker_filter),
|
node_cpu_expr(worker_filter),
|
||||||
{"h": 8, "w": 12, "x": 0, "y": 19},
|
{"h": 8, "w": 12, "x": 0, "y": 19},
|
||||||
@ -565,7 +541,7 @@ def build_overview():
|
|||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
15,
|
14,
|
||||||
"Worker node RAM",
|
"Worker node RAM",
|
||||||
node_mem_expr(worker_filter),
|
node_mem_expr(worker_filter),
|
||||||
{"h": 8, "w": 12, "x": 12, "y": 19},
|
{"h": 8, "w": 12, "x": 12, "y": 19},
|
||||||
@ -580,7 +556,7 @@ def build_overview():
|
|||||||
|
|
||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
16,
|
15,
|
||||||
"Control plane CPU",
|
"Control plane CPU",
|
||||||
node_cpu_expr(CONTROL_REGEX),
|
node_cpu_expr(CONTROL_REGEX),
|
||||||
{"h": 7, "w": 12, "x": 0, "y": 27},
|
{"h": 7, "w": 12, "x": 0, "y": 27},
|
||||||
@ -592,7 +568,7 @@ def build_overview():
|
|||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
17,
|
16,
|
||||||
"Control plane RAM",
|
"Control plane RAM",
|
||||||
node_mem_expr(CONTROL_REGEX),
|
node_mem_expr(CONTROL_REGEX),
|
||||||
{"h": 7, "w": 12, "x": 12, "y": 27},
|
{"h": 7, "w": 12, "x": 12, "y": 27},
|
||||||
@ -605,7 +581,7 @@ def build_overview():
|
|||||||
|
|
||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
18,
|
17,
|
||||||
"Cluster ingress throughput",
|
"Cluster ingress throughput",
|
||||||
NET_INGRESS_EXPR,
|
NET_INGRESS_EXPR,
|
||||||
{"h": 7, "w": 12, "x": 0, "y": 34},
|
{"h": 7, "w": 12, "x": 0, "y": 34},
|
||||||
@ -617,7 +593,7 @@ def build_overview():
|
|||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
19,
|
18,
|
||||||
"Cluster egress throughput",
|
"Cluster egress throughput",
|
||||||
NET_EGRESS_EXPR,
|
NET_EGRESS_EXPR,
|
||||||
{"h": 7, "w": 12, "x": 12, "y": 34},
|
{"h": 7, "w": 12, "x": 12, "y": 34},
|
||||||
@ -630,7 +606,7 @@ def build_overview():
|
|||||||
|
|
||||||
panels.append(
|
panels.append(
|
||||||
timeseries_panel(
|
timeseries_panel(
|
||||||
20,
|
19,
|
||||||
"Root filesystem usage",
|
"Root filesystem usage",
|
||||||
root_usage_expr(),
|
root_usage_expr(),
|
||||||
{"h": 8, "w": 12, "x": 0, "y": 41},
|
{"h": 8, "w": 12, "x": 0, "y": 41},
|
||||||
@ -645,7 +621,7 @@ def build_overview():
|
|||||||
)
|
)
|
||||||
panels.append(
|
panels.append(
|
||||||
{
|
{
|
||||||
"id": 21,
|
"id": 20,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Nodes closest to full root disks",
|
"title": "Nodes closest to full root disks",
|
||||||
"datasource": PROM_DS,
|
"datasource": PROM_DS,
|
||||||
@ -679,10 +655,10 @@ def build_overview():
|
|||||||
)
|
)
|
||||||
|
|
||||||
storage_panels = [
|
storage_panels = [
|
||||||
(22, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
|
(21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
|
||||||
(23, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
|
(22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
|
||||||
(24, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
|
(23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
|
||||||
(25, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
|
(24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
|
||||||
]
|
]
|
||||||
for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
|
for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
|
||||||
panels.append(
|
panels.append(
|
||||||
@ -699,7 +675,7 @@ def build_overview():
|
|||||||
|
|
||||||
panels.append(
|
panels.append(
|
||||||
text_panel(
|
text_panel(
|
||||||
26,
|
25,
|
||||||
"About this dashboard",
|
"About this dashboard",
|
||||||
textwrap.dedent(
|
textwrap.dedent(
|
||||||
"""\
|
"""\
|
||||||
|
|||||||
@ -438,7 +438,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
"expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -511,7 +511,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
"expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -584,7 +584,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
"expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -653,7 +653,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
"expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -709,20 +709,20 @@
|
|||||||
{
|
{
|
||||||
"id": 11,
|
"id": 11,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace GPU share",
|
"title": "Namespace CPU share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
},
|
},
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
"h": 9,
|
"h": 9,
|
||||||
"w": 8,
|
"w": 12,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 10
|
"y": 10
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)",
|
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -751,20 +751,20 @@
|
|||||||
{
|
{
|
||||||
"id": 12,
|
"id": 12,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace CPU share",
|
"title": "Namespace RAM share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
},
|
},
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
"h": 9,
|
"h": 9,
|
||||||
"w": 8,
|
"w": 12,
|
||||||
"x": 8,
|
"x": 12,
|
||||||
"y": 10
|
"y": 10
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
|
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -792,48 +792,6 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 13,
|
"id": 13,
|
||||||
"type": "piechart",
|
|
||||||
"title": "Namespace RAM share",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 9,
|
|
||||||
"w": 8,
|
|
||||||
"x": 16,
|
|
||||||
"y": 10
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
|
|
||||||
"refId": "A",
|
|
||||||
"legendFormat": "{{namespace}}"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "percent"
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"legend": {
|
|
||||||
"displayMode": "list",
|
|
||||||
"placement": "right"
|
|
||||||
},
|
|
||||||
"pieType": "pie",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 14,
|
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Worker node CPU",
|
"title": "Worker node CPU",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -880,7 +838,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 15,
|
"id": 14,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Worker node RAM",
|
"title": "Worker node RAM",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -927,7 +885,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 16,
|
"id": 15,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Control plane CPU",
|
"title": "Control plane CPU",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -964,7 +922,7 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 17,
|
"id": 16,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Control plane RAM",
|
"title": "Control plane RAM",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1001,7 +959,7 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 18,
|
"id": 17,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Cluster ingress throughput",
|
"title": "Cluster ingress throughput",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1044,7 +1002,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 19,
|
"id": 18,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Cluster egress throughput",
|
"title": "Cluster egress throughput",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1087,7 +1045,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 20,
|
"id": 19,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Root filesystem usage",
|
"title": "Root filesystem usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1135,7 +1093,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 21,
|
"id": 20,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Nodes closest to full root disks",
|
"title": "Nodes closest to full root disks",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1204,7 +1162,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 22,
|
"id": 21,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Astreae usage",
|
"title": "Astreae usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1275,7 +1233,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 23,
|
"id": 22,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Asteria usage",
|
"title": "Asteria usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1346,7 +1304,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 24,
|
"id": 23,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Astreae free",
|
"title": "Astreae free",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1413,7 +1371,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 25,
|
"id": 24,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Asteria free",
|
"title": "Asteria free",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1480,7 +1438,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 26,
|
"id": 25,
|
||||||
"type": "text",
|
"type": "text",
|
||||||
"title": "About this dashboard",
|
"title": "About this dashboard",
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
|
|||||||
@ -447,7 +447,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
"expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -520,7 +520,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
"expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -593,7 +593,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
"expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -662,7 +662,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
"expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}",
|
"legendFormat": "{{node}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -718,20 +718,20 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 11,
|
"id": 11,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace GPU share",
|
"title": "Namespace CPU share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
},
|
},
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
"h": 9,
|
"h": 9,
|
||||||
"w": 8,
|
"w": 12,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 10
|
"y": 10
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)",
|
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -760,20 +760,20 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 12,
|
"id": 12,
|
||||||
"type": "piechart",
|
"type": "piechart",
|
||||||
"title": "Namespace CPU share",
|
"title": "Namespace RAM share",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
},
|
},
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
"h": 9,
|
"h": 9,
|
||||||
"w": 8,
|
"w": 12,
|
||||||
"x": 8,
|
"x": 12,
|
||||||
"y": 10
|
"y": 10
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
|
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{namespace}}"
|
"legendFormat": "{{namespace}}"
|
||||||
}
|
}
|
||||||
@ -801,48 +801,6 @@ data:
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 13,
|
"id": 13,
|
||||||
"type": "piechart",
|
|
||||||
"title": "Namespace RAM share",
|
|
||||||
"datasource": {
|
|
||||||
"type": "prometheus",
|
|
||||||
"uid": "atlas-vm"
|
|
||||||
},
|
|
||||||
"gridPos": {
|
|
||||||
"h": 9,
|
|
||||||
"w": 8,
|
|
||||||
"x": 16,
|
|
||||||
"y": 10
|
|
||||||
},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
|
|
||||||
"refId": "A",
|
|
||||||
"legendFormat": "{{namespace}}"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "percent"
|
|
||||||
},
|
|
||||||
"overrides": []
|
|
||||||
},
|
|
||||||
"options": {
|
|
||||||
"legend": {
|
|
||||||
"displayMode": "list",
|
|
||||||
"placement": "right"
|
|
||||||
},
|
|
||||||
"pieType": "pie",
|
|
||||||
"reduceOptions": {
|
|
||||||
"calcs": [
|
|
||||||
"lastNotNull"
|
|
||||||
],
|
|
||||||
"fields": "",
|
|
||||||
"values": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 14,
|
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Worker node CPU",
|
"title": "Worker node CPU",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -889,7 +847,7 @@ data:
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 15,
|
"id": 14,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Worker node RAM",
|
"title": "Worker node RAM",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -936,7 +894,7 @@ data:
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 16,
|
"id": 15,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Control plane CPU",
|
"title": "Control plane CPU",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -973,7 +931,7 @@ data:
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 17,
|
"id": 16,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Control plane RAM",
|
"title": "Control plane RAM",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1010,7 +968,7 @@ data:
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 18,
|
"id": 17,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Cluster ingress throughput",
|
"title": "Cluster ingress throughput",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1053,7 +1011,7 @@ data:
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 19,
|
"id": 18,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Cluster egress throughput",
|
"title": "Cluster egress throughput",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1096,7 +1054,7 @@ data:
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 20,
|
"id": 19,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Root filesystem usage",
|
"title": "Root filesystem usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1144,7 +1102,7 @@ data:
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 21,
|
"id": 20,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Nodes closest to full root disks",
|
"title": "Nodes closest to full root disks",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1213,7 +1171,7 @@ data:
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 22,
|
"id": 21,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Astreae usage",
|
"title": "Astreae usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1284,7 +1242,7 @@ data:
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 23,
|
"id": 22,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Asteria usage",
|
"title": "Asteria usage",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1355,7 +1313,7 @@ data:
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 24,
|
"id": 23,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Astreae free",
|
"title": "Astreae free",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1422,7 +1380,7 @@ data:
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 25,
|
"id": 24,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Asteria free",
|
"title": "Asteria free",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
@ -1489,7 +1447,7 @@ data:
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 26,
|
"id": 25,
|
||||||
"type": "text",
|
"type": "text",
|
||||||
"title": "About this dashboard",
|
"title": "About this dashboard",
|
||||||
"gridPos": {
|
"gridPos": {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user