monitoring: add namespace gpu share
This commit is contained in:
parent
c53c518301
commit
0708522b28
@ -175,6 +175,11 @@ def namespace_ram_share_expr():
|
||||
return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )"
|
||||
|
||||
|
||||
def namespace_gpu_share_expr():
    """Build the query for each namespace's share of total GPU usage.

    The numerator is NAMESPACE_GPU_RAW restricted, via ``and on(namespace)``,
    to the namespaces that also appear in NAMESPACE_COMBINED_FILTER. The
    denominator is the sum of NAMESPACE_GPU_RAW over every namespace, and the
    ratio is scaled by 100 to give a percentage.

    Returns:
        The query expression as a string.
    """
    # Keep only the GPU series whose namespace label survives the shared filter.
    filtered_usage = " and on(namespace) ".join(
        (f"( {NAMESPACE_GPU_RAW} )", f"( {NAMESPACE_COMBINED_FILTER} )")
    )
    # NOTE(review): the divisor is an un-grouped sum(), i.e. a label-less
    # series; confirm the datasource matches it against the per-namespace
    # numerator as intended.
    cluster_total = f"sum( {NAMESPACE_GPU_RAW} )"
    return f"100 * ( {filtered_usage} ) / {cluster_total}"
|
||||
|
||||
|
||||
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
|
||||
CRASHLOOP_EXPR = (
|
||||
'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
|
||||
@ -214,6 +219,9 @@ NAMESPACE_CPU_RAW = (
|
||||
# Per-namespace memory: container_memory_working_set_bytes summed by
# namespace, limited to series with non-empty namespace/pod/container labels.
NAMESPACE_RAM_RAW = (
    "sum(container_memory_working_set_bytes"
    '{namespace!="",pod!="",container!=""})'
    " by (namespace)"
)
|
||||
# Per-namespace GPU usage: 5-minute rate of container_gpu_usage_seconds_total
# summed by namespace, limited to series with non-empty
# namespace/pod/container labels.
# NOTE(review): confirm the cluster's GPU exporter actually publishes a metric
# with this exact name; otherwise the GPU share panel will be empty.
NAMESPACE_GPU_RAW = (
    "sum(rate(container_gpu_usage_seconds_total"
    '{namespace!="",pod!="",container!=""}[5m]))'
    " by (namespace)"
)
|
||||
NAMESPACE_COMBINED_FILTER = (
|
||||
'topk(10, ('
|
||||
+ NAMESPACE_CPU_RAW
|
||||
@ -512,7 +520,7 @@ def build_overview():
|
||||
11,
|
||||
"Namespace CPU share",
|
||||
namespace_cpu_share_expr(),
|
||||
{"h": 9, "w": 12, "x": 0, "y": 10},
|
||||
{"h": 9, "w": 8, "x": 0, "y": 10},
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
@ -520,14 +528,22 @@ def build_overview():
|
||||
12,
|
||||
"Namespace RAM share",
|
||||
namespace_ram_share_expr(),
|
||||
{"h": 9, "w": 12, "x": 12, "y": 10},
|
||||
{"h": 9, "w": 8, "x": 8, "y": 10},
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
pie_panel(
|
||||
13,
|
||||
"Namespace GPU share",
|
||||
namespace_gpu_share_expr(),
|
||||
{"h": 9, "w": 8, "x": 16, "y": 10},
|
||||
)
|
||||
)
|
||||
|
||||
worker_filter = f"{WORKER_REGEX}"
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
13,
|
||||
14,
|
||||
"Worker node CPU",
|
||||
node_cpu_expr(worker_filter),
|
||||
{"h": 8, "w": 12, "x": 0, "y": 19},
|
||||
@ -541,7 +557,7 @@ def build_overview():
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
14,
|
||||
15,
|
||||
"Worker node RAM",
|
||||
node_mem_expr(worker_filter),
|
||||
{"h": 8, "w": 12, "x": 12, "y": 19},
|
||||
@ -556,7 +572,7 @@ def build_overview():
|
||||
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
15,
|
||||
16,
|
||||
"Control plane CPU",
|
||||
node_cpu_expr(CONTROL_REGEX),
|
||||
{"h": 7, "w": 12, "x": 0, "y": 27},
|
||||
@ -568,7 +584,7 @@ def build_overview():
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
16,
|
||||
17,
|
||||
"Control plane RAM",
|
||||
node_mem_expr(CONTROL_REGEX),
|
||||
{"h": 7, "w": 12, "x": 12, "y": 27},
|
||||
@ -581,7 +597,7 @@ def build_overview():
|
||||
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
17,
|
||||
18,
|
||||
"Cluster ingress throughput",
|
||||
NET_INGRESS_EXPR,
|
||||
{"h": 7, "w": 12, "x": 0, "y": 34},
|
||||
@ -593,7 +609,7 @@ def build_overview():
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
18,
|
||||
19,
|
||||
"Cluster egress throughput",
|
||||
NET_EGRESS_EXPR,
|
||||
{"h": 7, "w": 12, "x": 12, "y": 34},
|
||||
@ -606,7 +622,7 @@ def build_overview():
|
||||
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
19,
|
||||
20,
|
||||
"Root filesystem usage",
|
||||
root_usage_expr(),
|
||||
{"h": 8, "w": 12, "x": 0, "y": 41},
|
||||
@ -621,7 +637,7 @@ def build_overview():
|
||||
)
|
||||
panels.append(
|
||||
{
|
||||
"id": 20,
|
||||
"id": 21,
|
||||
"type": "bargauge",
|
||||
"title": "Nodes closest to full root disks",
|
||||
"datasource": PROM_DS,
|
||||
@ -655,10 +671,10 @@ def build_overview():
|
||||
)
|
||||
|
||||
storage_panels = [
|
||||
(21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
|
||||
(22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
|
||||
(23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
|
||||
(24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
|
||||
(22, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
|
||||
(23, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
|
||||
(24, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
|
||||
(25, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
|
||||
]
|
||||
for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
|
||||
panels.append(
|
||||
@ -675,7 +691,7 @@ def build_overview():
|
||||
|
||||
panels.append(
|
||||
text_panel(
|
||||
25,
|
||||
26,
|
||||
"About this dashboard",
|
||||
textwrap.dedent(
|
||||
"""\
|
||||
|
||||
@ -716,7 +716,7 @@
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
},
|
||||
@ -758,8 +758,8 @@
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 10
|
||||
},
|
||||
"targets": [
|
||||
@ -792,6 +792,48 @@
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"type": "piechart",
|
||||
"title": "Namespace GPU share",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 10
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "right"
|
||||
},
|
||||
"pieType": "pie",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"type": "timeseries",
|
||||
"title": "Worker node CPU",
|
||||
"datasource": {
|
||||
@ -838,7 +880,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"id": 15,
|
||||
"type": "timeseries",
|
||||
"title": "Worker node RAM",
|
||||
"datasource": {
|
||||
@ -885,7 +927,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "Control plane CPU",
|
||||
"datasource": {
|
||||
@ -922,7 +964,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Control plane RAM",
|
||||
"datasource": {
|
||||
@ -959,7 +1001,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "Cluster ingress throughput",
|
||||
"datasource": {
|
||||
@ -1002,7 +1044,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"id": 19,
|
||||
"type": "timeseries",
|
||||
"title": "Cluster egress throughput",
|
||||
"datasource": {
|
||||
@ -1045,7 +1087,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"id": 20,
|
||||
"type": "timeseries",
|
||||
"title": "Root filesystem usage",
|
||||
"datasource": {
|
||||
@ -1093,7 +1135,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"id": 21,
|
||||
"type": "bargauge",
|
||||
"title": "Nodes closest to full root disks",
|
||||
"datasource": {
|
||||
@ -1162,7 +1204,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 21,
|
||||
"id": 22,
|
||||
"type": "stat",
|
||||
"title": "Astreae usage",
|
||||
"datasource": {
|
||||
@ -1233,7 +1275,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
"id": 23,
|
||||
"type": "stat",
|
||||
"title": "Asteria usage",
|
||||
"datasource": {
|
||||
@ -1304,7 +1346,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 23,
|
||||
"id": 24,
|
||||
"type": "stat",
|
||||
"title": "Astreae free",
|
||||
"datasource": {
|
||||
@ -1371,7 +1413,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 24,
|
||||
"id": 25,
|
||||
"type": "stat",
|
||||
"title": "Asteria free",
|
||||
"datasource": {
|
||||
@ -1438,7 +1480,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 25,
|
||||
"id": 26,
|
||||
"type": "text",
|
||||
"title": "About this dashboard",
|
||||
"gridPos": {
|
||||
|
||||
@ -725,7 +725,7 @@ data:
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
},
|
||||
@ -767,8 +767,8 @@ data:
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 10
|
||||
},
|
||||
"targets": [
|
||||
@ -801,6 +801,48 @@ data:
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"type": "piechart",
|
||||
"title": "Namespace GPU share",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 10
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "right"
|
||||
},
|
||||
"pieType": "pie",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"type": "timeseries",
|
||||
"title": "Worker node CPU",
|
||||
"datasource": {
|
||||
@ -847,7 +889,7 @@ data:
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"id": 15,
|
||||
"type": "timeseries",
|
||||
"title": "Worker node RAM",
|
||||
"datasource": {
|
||||
@ -894,7 +936,7 @@ data:
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"id": 16,
|
||||
"type": "timeseries",
|
||||
"title": "Control plane CPU",
|
||||
"datasource": {
|
||||
@ -931,7 +973,7 @@ data:
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"id": 17,
|
||||
"type": "timeseries",
|
||||
"title": "Control plane RAM",
|
||||
"datasource": {
|
||||
@ -968,7 +1010,7 @@ data:
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"id": 18,
|
||||
"type": "timeseries",
|
||||
"title": "Cluster ingress throughput",
|
||||
"datasource": {
|
||||
@ -1011,7 +1053,7 @@ data:
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"id": 19,
|
||||
"type": "timeseries",
|
||||
"title": "Cluster egress throughput",
|
||||
"datasource": {
|
||||
@ -1054,7 +1096,7 @@ data:
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"id": 20,
|
||||
"type": "timeseries",
|
||||
"title": "Root filesystem usage",
|
||||
"datasource": {
|
||||
@ -1102,7 +1144,7 @@ data:
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"id": 21,
|
||||
"type": "bargauge",
|
||||
"title": "Nodes closest to full root disks",
|
||||
"datasource": {
|
||||
@ -1171,7 +1213,7 @@ data:
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 21,
|
||||
"id": 22,
|
||||
"type": "stat",
|
||||
"title": "Astreae usage",
|
||||
"datasource": {
|
||||
@ -1242,7 +1284,7 @@ data:
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
"id": 23,
|
||||
"type": "stat",
|
||||
"title": "Asteria usage",
|
||||
"datasource": {
|
||||
@ -1313,7 +1355,7 @@ data:
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 23,
|
||||
"id": 24,
|
||||
"type": "stat",
|
||||
"title": "Astreae free",
|
||||
"datasource": {
|
||||
@ -1380,7 +1422,7 @@ data:
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 24,
|
||||
"id": 25,
|
||||
"type": "stat",
|
||||
"title": "Asteria free",
|
||||
"datasource": {
|
||||
@ -1447,7 +1489,7 @@ data:
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 25,
|
||||
"id": 26,
|
||||
"type": "text",
|
||||
"title": "About this dashboard",
|
||||
"gridPos": {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user