atlas dashboards: show pod counts (not %) and make zero-friendly stats
This commit is contained in:
parent
b770575b42
commit
056b7b7770
@ -187,16 +187,21 @@ def namespace_gpu_share_expr():
|
||||
return namespace_share_expr(NAMESPACE_GPU_RAW)
|
||||
|
||||
|
||||
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
|
||||
PROBLEM_PODS_EXPR = (
|
||||
'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"})) '
|
||||
"or on() vector(0)"
|
||||
)
|
||||
CRASHLOOP_EXPR = (
|
||||
'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
|
||||
'{reason=~"CrashLoopBackOff|ImagePullBackOff"}))'
|
||||
'{reason=~"CrashLoopBackOff|ImagePullBackOff"})) '
|
||||
"or on() vector(0)"
|
||||
)
|
||||
STUCK_TERMINATING_EXPR = (
|
||||
'sum(max by (namespace,pod) ('
|
||||
'((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)'
|
||||
' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
|
||||
'))'
|
||||
')) '
|
||||
"or on() vector(0)"
|
||||
)
|
||||
UPTIME_WINDOW = "30d"
|
||||
TRAEFIK_READY_EXPR = (
|
||||
@ -549,7 +554,7 @@ def pie_panel(panel_id, title, expr, grid):
|
||||
"options": {
|
||||
"legend": {"displayMode": "list", "placement": "right"},
|
||||
"pieType": "pie",
|
||||
"displayLabels": ["percent"],
|
||||
"displayLabels": ["name", "percent"],
|
||||
"tooltip": {"mode": "single"},
|
||||
"colorScheme": "interpolateSpectral",
|
||||
"colorBy": "value",
|
||||
@ -569,6 +574,7 @@ def bargauge_panel(
|
||||
limit=None,
|
||||
thresholds=None,
|
||||
decimals=None,
|
||||
instant=False,
|
||||
):
|
||||
"""Return a bar gauge panel with label-aware reduction."""
|
||||
panel = {
|
||||
@ -577,7 +583,9 @@ def bargauge_panel(
|
||||
"title": title,
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": grid,
|
||||
"targets": [{"expr": expr, "refId": "A", "legendFormat": "{{node}}"}],
|
||||
"targets": [
|
||||
{"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": unit,
|
||||
@ -675,7 +683,13 @@ def build_overview():
|
||||
"title": "Control Plane Workloads",
|
||||
"expr": CONTROL_WORKLOADS_EXPR,
|
||||
"kind": "stat",
|
||||
"thresholds": count_thresholds,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "red", "value": 1},
|
||||
],
|
||||
},
|
||||
"links": link_to("atlas-pods"),
|
||||
},
|
||||
{
|
||||
@ -683,7 +697,13 @@ def build_overview():
|
||||
"title": "Stuck Terminating",
|
||||
"expr": STUCK_TERMINATING_EXPR,
|
||||
"kind": "stat",
|
||||
"thresholds": count_thresholds,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "red", "value": 1},
|
||||
],
|
||||
},
|
||||
"links": link_to("atlas-pods"),
|
||||
},
|
||||
{
|
||||
@ -701,7 +721,13 @@ def build_overview():
|
||||
"title": "Problem Pods",
|
||||
"expr": PROBLEM_PODS_EXPR,
|
||||
"kind": "stat",
|
||||
"thresholds": count_thresholds,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "red", "value": 1},
|
||||
],
|
||||
},
|
||||
"links": link_to("atlas-pods"),
|
||||
},
|
||||
{
|
||||
@ -709,7 +735,13 @@ def build_overview():
|
||||
"title": "CrashLoop / ImagePull",
|
||||
"expr": CRASHLOOP_EXPR,
|
||||
"kind": "stat",
|
||||
"thresholds": count_thresholds,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "red", "value": 1},
|
||||
],
|
||||
},
|
||||
"links": link_to("atlas-pods"),
|
||||
},
|
||||
{
|
||||
@ -894,7 +926,7 @@ def build_overview():
|
||||
panels.append(
|
||||
pie_panel(
|
||||
28,
|
||||
"Pods by Node",
|
||||
"Node Pod Share",
|
||||
'sum(kube_pod_info{pod!="" , node!=""}) by (node)',
|
||||
{"h": 10, "w": 12, "x": 0, "y": 54},
|
||||
)
|
||||
@ -917,6 +949,7 @@ def build_overview():
|
||||
{"color": "red", "value": 100},
|
||||
],
|
||||
},
|
||||
instant=True,
|
||||
)
|
||||
)
|
||||
|
||||
@ -1102,7 +1135,7 @@ def build_pods_dashboard():
|
||||
panels.append(
|
||||
pie_panel(
|
||||
8,
|
||||
"Pods by Node",
|
||||
"Node Pod Share",
|
||||
'sum(kube_pod_info{pod!="" , node!=""}) by (node)',
|
||||
{"h": 8, "w": 12, "x": 12, "y": 34},
|
||||
)
|
||||
@ -1125,6 +1158,7 @@ def build_pods_dashboard():
|
||||
{"color": "red", "value": 100},
|
||||
],
|
||||
},
|
||||
instant=True,
|
||||
)
|
||||
)
|
||||
return {
|
||||
|
||||
@ -41,6 +41,7 @@
|
||||
},
|
||||
"pieType": "pie",
|
||||
"displayLabels": [
|
||||
"name",
|
||||
"percent"
|
||||
],
|
||||
"tooltip": {
|
||||
|
||||
@ -93,17 +93,9 @@
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 3
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -151,7 +143,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
|
||||
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -168,17 +160,9 @@
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 3
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -295,7 +279,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -312,17 +296,9 @@
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 3
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -370,7 +346,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -387,17 +363,9 @@
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 3
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1087,6 +1055,7 @@
|
||||
},
|
||||
"pieType": "pie",
|
||||
"displayLabels": [
|
||||
"name",
|
||||
"percent"
|
||||
],
|
||||
"tooltip": {
|
||||
@ -1140,6 +1109,7 @@
|
||||
},
|
||||
"pieType": "pie",
|
||||
"displayLabels": [
|
||||
"name",
|
||||
"percent"
|
||||
],
|
||||
"tooltip": {
|
||||
@ -1193,6 +1163,7 @@
|
||||
},
|
||||
"pieType": "pie",
|
||||
"displayLabels": [
|
||||
"name",
|
||||
"percent"
|
||||
],
|
||||
"tooltip": {
|
||||
@ -1380,7 +1351,7 @@
|
||||
{
|
||||
"id": 28,
|
||||
"type": "piechart",
|
||||
"title": "Pods by Node",
|
||||
"title": "Node Pod Share",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1414,6 +1385,7 @@
|
||||
},
|
||||
"pieType": "pie",
|
||||
"displayLabels": [
|
||||
"name",
|
||||
"percent"
|
||||
],
|
||||
"tooltip": {
|
||||
@ -1448,7 +1420,8 @@
|
||||
{
|
||||
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
|
||||
@ -20,7 +20,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -80,7 +80,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -140,7 +140,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
|
||||
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -363,7 +363,7 @@
|
||||
{
|
||||
"id": 8,
|
||||
"type": "piechart",
|
||||
"title": "Pods by Node",
|
||||
"title": "Node Pod Share",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -397,6 +397,7 @@
|
||||
},
|
||||
"pieType": "pie",
|
||||
"displayLabels": [
|
||||
"name",
|
||||
"percent"
|
||||
],
|
||||
"tooltip": {
|
||||
@ -431,7 +432,8 @@
|
||||
{
|
||||
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
|
||||
@ -50,6 +50,7 @@ data:
|
||||
},
|
||||
"pieType": "pie",
|
||||
"displayLabels": [
|
||||
"name",
|
||||
"percent"
|
||||
],
|
||||
"tooltip": {
|
||||
|
||||
@ -102,17 +102,9 @@ data:
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 3
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -160,7 +152,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
|
||||
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -177,17 +169,9 @@ data:
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 3
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -304,7 +288,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -321,17 +305,9 @@ data:
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 3
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -379,7 +355,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -396,17 +372,9 @@ data:
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 3
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1096,6 +1064,7 @@ data:
|
||||
},
|
||||
"pieType": "pie",
|
||||
"displayLabels": [
|
||||
"name",
|
||||
"percent"
|
||||
],
|
||||
"tooltip": {
|
||||
@ -1149,6 +1118,7 @@ data:
|
||||
},
|
||||
"pieType": "pie",
|
||||
"displayLabels": [
|
||||
"name",
|
||||
"percent"
|
||||
],
|
||||
"tooltip": {
|
||||
@ -1202,6 +1172,7 @@ data:
|
||||
},
|
||||
"pieType": "pie",
|
||||
"displayLabels": [
|
||||
"name",
|
||||
"percent"
|
||||
],
|
||||
"tooltip": {
|
||||
@ -1389,7 +1360,7 @@ data:
|
||||
{
|
||||
"id": 28,
|
||||
"type": "piechart",
|
||||
"title": "Pods by Node",
|
||||
"title": "Node Pod Share",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -1423,6 +1394,7 @@ data:
|
||||
},
|
||||
"pieType": "pie",
|
||||
"displayLabels": [
|
||||
"name",
|
||||
"percent"
|
||||
],
|
||||
"tooltip": {
|
||||
@ -1457,7 +1429,8 @@ data:
|
||||
{
|
||||
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
|
||||
@ -29,7 +29,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -89,7 +89,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
|
||||
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -149,7 +149,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
|
||||
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -372,7 +372,7 @@ data:
|
||||
{
|
||||
"id": 8,
|
||||
"type": "piechart",
|
||||
"title": "Pods by Node",
|
||||
"title": "Node Pod Share",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -406,6 +406,7 @@ data:
|
||||
},
|
||||
"pieType": "pie",
|
||||
"displayLabels": [
|
||||
"name",
|
||||
"percent"
|
||||
],
|
||||
"tooltip": {
|
||||
@ -440,7 +441,8 @@ data:
|
||||
{
|
||||
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
"legendFormat": "{{node}}",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user