monitoring: rework gpu share + gauges
This commit is contained in:
parent
497164a1ad
commit
8e6c0a3cfe
@ -238,7 +238,7 @@ NAMESPACE_GPU_RAW = (
|
||||
+ NAMESPACE_GPU_ALLOC
|
||||
+ " * 0)"
|
||||
)
|
||||
NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_RAW
|
||||
NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_ALLOC
|
||||
NAMESPACE_COMBINED_FILTER = (
|
||||
'topk(10, ('
|
||||
+ NAMESPACE_CPU_RAW
|
||||
@ -319,6 +319,49 @@ def stat_panel(
|
||||
return panel
|
||||
|
||||
|
||||
def gauge_panel(
|
||||
panel_id,
|
||||
title,
|
||||
expr,
|
||||
grid,
|
||||
*,
|
||||
min_value=0,
|
||||
max_value=1,
|
||||
thresholds=None,
|
||||
links=None,
|
||||
):
|
||||
return {
|
||||
"id": panel_id,
|
||||
"type": "gauge",
|
||||
"title": title,
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": grid,
|
||||
"targets": [{"expr": expr, "refId": "A"}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"min": min_value,
|
||||
"max": max_value,
|
||||
"thresholds": thresholds
|
||||
or {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "red", "value": max_value},
|
||||
],
|
||||
},
|
||||
},
|
||||
"overrides": [],
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
||||
"orientation": "auto",
|
||||
"showThresholdMarkers": False,
|
||||
"showThresholdLabels": False,
|
||||
},
|
||||
**({"links": links} if links else {}),
|
||||
}
|
||||
|
||||
|
||||
def timeseries_panel(
|
||||
panel_id,
|
||||
title,
|
||||
@ -472,7 +515,10 @@ def build_overview():
|
||||
]
|
||||
for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
|
||||
thresholds = None
|
||||
min_value = 0
|
||||
max_value = ok_value or 5
|
||||
if panel_id == 1:
|
||||
max_value = WORKER_TOTAL
|
||||
thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
@ -483,6 +529,7 @@ def build_overview():
|
||||
],
|
||||
}
|
||||
elif panel_id == 2:
|
||||
max_value = CONTROL_TOTAL
|
||||
thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
@ -491,6 +538,7 @@ def build_overview():
|
||||
],
|
||||
}
|
||||
elif panel_id in (3, 4, 5):
|
||||
max_value = 4
|
||||
thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
@ -500,13 +548,22 @@ def build_overview():
|
||||
{"color": "red", "value": 3},
|
||||
],
|
||||
}
|
||||
else:
|
||||
thresholds = {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "red", "value": max_value},
|
||||
],
|
||||
}
|
||||
panels.append(
|
||||
stat_panel(
|
||||
gauge_panel(
|
||||
panel_id,
|
||||
title,
|
||||
expr,
|
||||
{"h": 5, "w": 4, "x": 4 * idx, "y": 0},
|
||||
value_suffix=suffix,
|
||||
min_value=min_value,
|
||||
max_value=max_value,
|
||||
thresholds=thresholds,
|
||||
links=links,
|
||||
)
|
||||
|
||||
@ -9,7 +9,7 @@
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"type": "gauge",
|
||||
"title": "Workers ready",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@ -29,10 +29,8 @@
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"max": 18,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
@ -53,19 +51,11 @@
|
||||
"value": 18
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto",
|
||||
"valueSuffix": "/18"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
@ -73,12 +63,14 @@
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
"orientation": "auto",
|
||||
"showThresholdMarkers": false,
|
||||
"showThresholdLabels": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"type": "gauge",
|
||||
"title": "Control plane ready",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@ -98,10 +90,8 @@
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"max": 3,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
@ -114,19 +104,11 @@
|
||||
"value": 3
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto",
|
||||
"valueSuffix": "/3"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
@ -134,12 +116,14 @@
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
"orientation": "auto",
|
||||
"showThresholdMarkers": false,
|
||||
"showThresholdLabels": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"type": "gauge",
|
||||
"title": "Control plane workloads",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@ -159,10 +143,8 @@
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"max": 4,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
@ -183,18 +165,11 @@
|
||||
"value": 3
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
@ -202,7 +177,9 @@
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
"orientation": "auto",
|
||||
"showThresholdMarkers": false,
|
||||
"showThresholdLabels": false
|
||||
},
|
||||
"links": [
|
||||
{
|
||||
@ -214,7 +191,7 @@
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"type": "gauge",
|
||||
"title": "Problem pods",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@ -234,10 +211,8 @@
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"max": 4,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
@ -258,18 +233,11 @@
|
||||
"value": 3
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
@ -277,7 +245,9 @@
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
"orientation": "auto",
|
||||
"showThresholdMarkers": false,
|
||||
"showThresholdLabels": false
|
||||
},
|
||||
"links": [
|
||||
{
|
||||
@ -289,7 +259,7 @@
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"type": "gauge",
|
||||
"title": "Stuck terminating",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@ -309,10 +279,8 @@
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"max": 4,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
@ -333,18 +301,11 @@
|
||||
"value": 3
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
@ -352,7 +313,9 @@
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
"orientation": "auto",
|
||||
"showThresholdMarkers": false,
|
||||
"showThresholdLabels": false
|
||||
},
|
||||
"links": [
|
||||
{
|
||||
@ -364,7 +327,7 @@
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"type": "gauge",
|
||||
"title": "Running pods",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@ -384,34 +347,25 @@
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"max": 5,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
"color": "red",
|
||||
"value": 5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
@ -419,7 +373,9 @@
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
"orientation": "auto",
|
||||
"showThresholdMarkers": false,
|
||||
"showThresholdLabels": false
|
||||
}
|
||||
},
|
||||
{
|
||||
@ -722,7 +678,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
|
||||
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -764,7 +720,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
|
||||
"expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -806,7 +762,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
|
||||
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -1480,22 +1436,6 @@
|
||||
"targetBlank": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 25,
|
||||
"type": "text",
|
||||
"title": "About this dashboard",
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 55
|
||||
},
|
||||
"datasource": null,
|
||||
"options": {
|
||||
"mode": "markdown",
|
||||
"content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly."
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
|
||||
@ -18,7 +18,7 @@ data:
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"type": "gauge",
|
||||
"title": "Workers ready",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@ -38,10 +38,8 @@ data:
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"max": 18,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
@ -62,19 +60,11 @@ data:
|
||||
"value": 18
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto",
|
||||
"valueSuffix": "/18"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
@ -82,12 +72,14 @@ data:
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
"orientation": "auto",
|
||||
"showThresholdMarkers": false,
|
||||
"showThresholdLabels": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"type": "gauge",
|
||||
"title": "Control plane ready",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@ -107,10 +99,8 @@ data:
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"max": 3,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
@ -123,19 +113,11 @@ data:
|
||||
"value": 3
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto",
|
||||
"valueSuffix": "/3"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
@ -143,12 +125,14 @@ data:
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
"orientation": "auto",
|
||||
"showThresholdMarkers": false,
|
||||
"showThresholdLabels": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"type": "gauge",
|
||||
"title": "Control plane workloads",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@ -168,10 +152,8 @@ data:
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"max": 4,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
@ -192,18 +174,11 @@ data:
|
||||
"value": 3
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
@ -211,7 +186,9 @@ data:
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
"orientation": "auto",
|
||||
"showThresholdMarkers": false,
|
||||
"showThresholdLabels": false
|
||||
},
|
||||
"links": [
|
||||
{
|
||||
@ -223,7 +200,7 @@ data:
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"type": "gauge",
|
||||
"title": "Problem pods",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@ -243,10 +220,8 @@ data:
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"max": 4,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
@ -267,18 +242,11 @@ data:
|
||||
"value": 3
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
@ -286,7 +254,9 @@ data:
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
"orientation": "auto",
|
||||
"showThresholdMarkers": false,
|
||||
"showThresholdLabels": false
|
||||
},
|
||||
"links": [
|
||||
{
|
||||
@ -298,7 +268,7 @@ data:
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"type": "gauge",
|
||||
"title": "Stuck terminating",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@ -318,10 +288,8 @@ data:
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"max": 4,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
@ -342,18 +310,11 @@ data:
|
||||
"value": 3
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
@ -361,7 +322,9 @@ data:
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
"orientation": "auto",
|
||||
"showThresholdMarkers": false,
|
||||
"showThresholdLabels": false
|
||||
},
|
||||
"links": [
|
||||
{
|
||||
@ -373,7 +336,7 @@ data:
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"type": "gauge",
|
||||
"title": "Running pods",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@ -393,34 +356,25 @@ data:
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"min": 0,
|
||||
"max": 5,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
"color": "red",
|
||||
"value": 5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none",
|
||||
"custom": {
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
@ -428,7 +382,9 @@ data:
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
"orientation": "auto",
|
||||
"showThresholdMarkers": false,
|
||||
"showThresholdLabels": false
|
||||
}
|
||||
},
|
||||
{
|
||||
@ -731,7 +687,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
|
||||
"expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -773,7 +729,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
|
||||
"expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -815,7 +771,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
|
||||
"expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
@ -1489,22 +1445,6 @@ data:
|
||||
"targetBlank": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 25,
|
||||
"type": "text",
|
||||
"title": "About this dashboard",
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 55
|
||||
},
|
||||
"datasource": null,
|
||||
"options": {
|
||||
"mode": "markdown",
|
||||
"content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly."
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user