monitoring: add gpu pie and tidy net panels
This commit is contained in:
parent
beb3243839
commit
2ba642d49f
@ -167,12 +167,20 @@ def node_io_expr(scope=""):
|
||||
|
||||
def namespace_cpu_share_expr():
    """Build the PromQL expression for each namespace's share of CPU usage.

    Returns:
        A query string of the form ``100 * ( selected ) / total`` where
        ``selected`` is NAMESPACE_CPU_RAW restricted (``and on(namespace)``)
        to the namespaces kept by NAMESPACE_COMBINED_FILTER, and ``total`` is
        the sum of NAMESPACE_CPU_RAW across all namespaces, floored at 1.
    """
    selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
    # Floor the denominator so a near-zero cluster total cannot blow the
    # percentage up; the earlier unclamped return is superseded by this form.
    # NOTE(review): NAMESPACE_CPU_RAW is defined outside this view; if it is
    # measured in cores, a floor of 1 understates every share whenever the
    # whole cluster uses less than one core - confirm that is acceptable.
    total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)"
    return f"100 * ( {selected} ) / {total}"
|
||||
|
||||
|
||||
def namespace_ram_share_expr():
    """Build the PromQL expression for each namespace's share of RAM usage.

    Returns:
        A query string of the form ``100 * ( selected ) / total`` where
        ``selected`` is NAMESPACE_RAM_RAW restricted (``and on(namespace)``)
        to the namespaces kept by NAMESPACE_COMBINED_FILTER, and ``total`` is
        the sum of NAMESPACE_RAM_RAW across all namespaces, floored at 1.
    """
    selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )"
    # Floor the denominator at 1 so the division never sees a zero total;
    # the earlier unclamped return is superseded by this form.
    total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)"
    return f"100 * ( {selected} ) / {total}"
|
||||
|
||||
|
||||
def namespace_gpu_share_expr():
    """Build the PromQL expression for each namespace's share of GPU requests.

    Returns:
        ``100 * ( numerator ) / denominator`` as a query string. The
        numerator is NAMESPACE_GPU_RAW kept only for namespaces that appear
        in NAMESPACE_COMBINED_FILTER; the denominator is the sum of
        NAMESPACE_GPU_RAW over all namespaces, floored at 1.
    """
    numerator = (
        "( " + NAMESPACE_GPU_RAW + " ) and on(namespace) ( " + NAMESPACE_COMBINED_FILTER + " )"
    )
    # clamp_min keeps the denominator at 1 or above.
    denominator = "clamp_min(sum( " + NAMESPACE_GPU_RAW + " ), 1)"
    return "100 * ( {} ) / {}".format(numerator, denominator)
|
||||
|
||||
|
||||
# Sums, over (namespace, pod) pairs, the maximum kube_pod_status_phase sample
# among phases other than Running and Succeeded.
PROBLEM_PODS_EXPR = (
    "sum(max by (namespace,pod) "
    '(kube_pod_status_phase{phase!~"Running|Succeeded"}))'
)
|
||||
@ -214,12 +222,17 @@ NAMESPACE_CPU_RAW = (
|
||||
# Per-namespace sum of container_memory_working_set_bytes, restricted to
# series that carry non-empty namespace, pod and container labels.
NAMESPACE_RAM_RAW = " ".join(
    (
        'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""})',
        "by (namespace)",
    )
)
|
||||
# Per-namespace sum of GPU requests taken from
# kube_pod_container_resource_requests.
#
# kube-state-metrics sanitizes extended resource names before exposing them as
# the `resource` label, so the Kubernetes name "nvidia.com/gpu" is normally
# published as "nvidia_com_gpu". An exact match on the unsanitized name would
# select nothing there; the regex matcher accepts both spellings so the query
# works regardless of which form the exporter emits.
NAMESPACE_GPU_RAW = (
    'sum(kube_pod_container_resource_requests{namespace!="",resource=~"nvidia[._]com[/_]gpu"}) by (namespace)'
)
|
||||
# Selector shared by the namespace share pies: the ten namespaces with the
# highest combined score of CPU + (RAM / 1e9) + (GPU requests * 10).
#
# PromQL arithmetic between two instant vectors keeps only label sets present
# on BOTH sides, so adding the GPU term directly would discard every namespace
# that has no GPU requests and remove it from the CPU and RAM pies as well.
# The GPU term therefore falls back, via `or on(namespace)`, to a zero-valued
# series for each namespace that has CPU data.
# NOTE(review): any dashboard JSON rendered from this constant presumably has
# to be regenerated after this change - confirm against the build step.
NAMESPACE_COMBINED_FILTER = (
    "topk(10, ("
    + NAMESPACE_CPU_RAW
    + ") + ("
    + NAMESPACE_RAM_RAW
    + " / 1e9) + (( "
    + NAMESPACE_GPU_RAW
    + " * 10) or on(namespace) (("
    + NAMESPACE_CPU_RAW
    + ") * 0)))"
)
|
||||
# 5-minute rate of traefik_router_requests_total, summed per `router` label.
TRAEFIK_ROUTER_EXPR = "sum by (router) ({})".format(
    "rate(traefik_router_requests_total[5m])"
)
|
||||
NET_INGRESS_EXPR = (
|
||||
@ -512,22 +525,30 @@ def build_overview():
|
||||
11,
|
||||
"Namespace CPU share",
|
||||
namespace_cpu_share_expr(),
|
||||
{"h": 9, "w": 12, "x": 0, "y": 10},
|
||||
{"h": 9, "w": 8, "x": 0, "y": 10},
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
pie_panel(
|
||||
12,
|
||||
"Namespace GPU share",
|
||||
namespace_gpu_share_expr(),
|
||||
{"h": 9, "w": 8, "x": 8, "y": 10},
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
pie_panel(
|
||||
13,
|
||||
"Namespace RAM share",
|
||||
namespace_ram_share_expr(),
|
||||
{"h": 9, "w": 12, "x": 12, "y": 10},
|
||||
{"h": 9, "w": 8, "x": 16, "y": 10},
|
||||
)
|
||||
)
|
||||
|
||||
worker_filter = f"{WORKER_REGEX}"
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
13,
|
||||
14,
|
||||
"Worker node CPU",
|
||||
node_cpu_expr(worker_filter),
|
||||
{"h": 8, "w": 12, "x": 0, "y": 19},
|
||||
@ -541,7 +562,7 @@ def build_overview():
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
14,
|
||||
15,
|
||||
"Worker node RAM",
|
||||
node_mem_expr(worker_filter),
|
||||
{"h": 8, "w": 12, "x": 12, "y": 19},
|
||||
@ -556,7 +577,7 @@ def build_overview():
|
||||
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
15,
|
||||
16,
|
||||
"Control plane CPU",
|
||||
node_cpu_expr(CONTROL_REGEX),
|
||||
{"h": 7, "w": 12, "x": 0, "y": 27},
|
||||
@ -568,7 +589,7 @@ def build_overview():
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
16,
|
||||
17,
|
||||
"Control plane RAM",
|
||||
node_mem_expr(CONTROL_REGEX),
|
||||
{"h": 7, "w": 12, "x": 12, "y": 27},
|
||||
@ -581,11 +602,12 @@ def build_overview():
|
||||
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
17,
|
||||
18,
|
||||
"Cluster ingress throughput",
|
||||
NET_INGRESS_EXPR,
|
||||
{"h": 7, "w": 12, "x": 0, "y": 34},
|
||||
unit="Bps",
|
||||
legend="Ingress",
|
||||
legend_display="list",
|
||||
legend_placement="bottom",
|
||||
links=link_to("atlas-network"),
|
||||
@ -593,11 +615,12 @@ def build_overview():
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
18,
|
||||
19,
|
||||
"Cluster egress throughput",
|
||||
NET_EGRESS_EXPR,
|
||||
{"h": 7, "w": 12, "x": 12, "y": 34},
|
||||
unit="Bps",
|
||||
legend="Egress",
|
||||
legend_display="list",
|
||||
legend_placement="bottom",
|
||||
links=link_to("atlas-network"),
|
||||
@ -606,7 +629,7 @@ def build_overview():
|
||||
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
19,
|
||||
20,
|
||||
"Root filesystem usage",
|
||||
root_usage_expr(),
|
||||
{"h": 8, "w": 12, "x": 0, "y": 41},
|
||||
@ -621,12 +644,12 @@ def build_overview():
|
||||
)
|
||||
panels.append(
|
||||
{
|
||||
"id": 20,
|
||||
"id": 21,
|
||||
"type": "bargauge",
|
||||
"title": "Nodes closest to full root disks",
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 41},
|
||||
"targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A"}],
|
||||
"targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
|
||||
@ -716,13 +716,13 @@
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
|
||||
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -751,20 +751,20 @@
|
||||
{
|
||||
"id": 12,
|
||||
"type": "piechart",
|
||||
"title": "Namespace RAM share",
|
||||
"title": "Namespace GPU share",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 10
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )",
|
||||
"expr": "100 * ( ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -792,6 +792,48 @@
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"type": "piechart",
|
||||
"title": "Namespace RAM share",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 10
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "right"
|
||||
},
|
||||
"pieType": "pie",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"type": "timeseries",
|
||||
"title": "Worker node CPU",
|
||||
"datasource": {
|
||||
@ -838,7 +880,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"id": 15,
|
||||
"type": "timeseries",
|
||||
"title": "Worker node RAM",
|
||||
"datasource": {
|
||||
@ -885,7 +927,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "Control plane CPU",
|
||||
"datasource": {
|
||||
@ -922,7 +964,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Control plane RAM",
|
||||
"datasource": {
|
||||
@ -959,7 +1001,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "Cluster ingress throughput",
|
||||
"datasource": {
|
||||
@ -975,50 +1017,8 @@
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"links": [
|
||||
{
|
||||
"title": "Open atlas-network dashboard",
|
||||
"url": "/d/atlas-network",
|
||||
"targetBlank": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "Cluster egress throughput",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 34
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
|
||||
"refId": "A"
|
||||
"refId": "A",
|
||||
"legendFormat": "Ingress"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
@ -1047,6 +1047,50 @@
|
||||
{
|
||||
"id": 19,
|
||||
"type": "timeseries",
|
||||
"title": "Cluster egress throughput",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 34
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Egress"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"links": [
|
||||
{
|
||||
"title": "Open atlas-network dashboard",
|
||||
"url": "/d/atlas-network",
|
||||
"targetBlank": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"type": "timeseries",
|
||||
"title": "Root filesystem usage",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@ -1093,7 +1137,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"id": 21,
|
||||
"type": "bargauge",
|
||||
"title": "Nodes closest to full root disks",
|
||||
"datasource": {
|
||||
@ -1109,7 +1153,8 @@
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||
"refId": "A"
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
|
||||
@ -725,13 +725,13 @@ data:
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
|
||||
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -760,20 +760,20 @@ data:
|
||||
{
|
||||
"id": 12,
|
||||
"type": "piechart",
|
||||
"title": "Namespace RAM share",
|
||||
"title": "Namespace GPU share",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 10
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )",
|
||||
"expr": "100 * ( ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -801,6 +801,48 @@ data:
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"type": "piechart",
|
||||
"title": "Namespace RAM share",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 10
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "right"
|
||||
},
|
||||
"pieType": "pie",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"type": "timeseries",
|
||||
"title": "Worker node CPU",
|
||||
"datasource": {
|
||||
@ -847,7 +889,7 @@ data:
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"id": 15,
|
||||
"type": "timeseries",
|
||||
"title": "Worker node RAM",
|
||||
"datasource": {
|
||||
@ -894,7 +936,7 @@ data:
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "Control plane CPU",
|
||||
"datasource": {
|
||||
@ -931,7 +973,7 @@ data:
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Control plane RAM",
|
||||
"datasource": {
|
||||
@ -968,7 +1010,7 @@ data:
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "Cluster ingress throughput",
|
||||
"datasource": {
|
||||
@ -984,50 +1026,8 @@ data:
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"links": [
|
||||
{
|
||||
"title": "Open atlas-network dashboard",
|
||||
"url": "/d/atlas-network",
|
||||
"targetBlank": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "Cluster egress throughput",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 34
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
|
||||
"refId": "A"
|
||||
"refId": "A",
|
||||
"legendFormat": "Ingress"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
@ -1056,6 +1056,50 @@ data:
|
||||
{
|
||||
"id": 19,
|
||||
"type": "timeseries",
|
||||
"title": "Cluster egress throughput",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 34
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)",
|
||||
"refId": "A",
|
||||
"legendFormat": "Egress"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"links": [
|
||||
{
|
||||
"title": "Open atlas-network dashboard",
|
||||
"url": "/d/atlas-network",
|
||||
"targetBlank": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"type": "timeseries",
|
||||
"title": "Root filesystem usage",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@ -1102,7 +1146,7 @@ data:
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"id": 21,
|
||||
"type": "bargauge",
|
||||
"title": "Nodes closest to full root disks",
|
||||
"datasource": {
|
||||
@ -1118,7 +1162,8 @@ data:
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
|
||||
"refId": "A"
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user