atlas dashboards: show pod counts (not %) and make zero-friendly stats

This commit is contained in:
Brad Stein 2025-12-12 20:30:00 -03:00
parent b770575b42
commit 056b7b7770
7 changed files with 89 additions and 103 deletions

View File

@ -187,16 +187,21 @@ def namespace_gpu_share_expr():
return namespace_share_expr(NAMESPACE_GPU_RAW)
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
PROBLEM_PODS_EXPR = (
'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"})) '
"or on() vector(0)"
)
CRASHLOOP_EXPR = (
'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
'{reason=~"CrashLoopBackOff|ImagePullBackOff"})) '
"or on() vector(0)"
)
STUCK_TERMINATING_EXPR = (
'sum(max by (namespace,pod) ('
'((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)'
' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
')) '
"or on() vector(0)"
)
UPTIME_WINDOW = "30d"
TRAEFIK_READY_EXPR = (
@ -549,7 +554,7 @@ def pie_panel(panel_id, title, expr, grid):
"options": {
"legend": {"displayMode": "list", "placement": "right"},
"pieType": "pie",
"displayLabels": ["percent"],
"displayLabels": ["name", "percent"],
"tooltip": {"mode": "single"},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
@ -569,6 +574,7 @@ def bargauge_panel(
limit=None,
thresholds=None,
decimals=None,
instant=False,
):
"""Return a bar gauge panel with label-aware reduction."""
panel = {
@ -577,7 +583,9 @@ def bargauge_panel(
"title": title,
"datasource": PROM_DS,
"gridPos": grid,
"targets": [{"expr": expr, "refId": "A", "legendFormat": "{{node}}"}],
"targets": [
{"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})}
],
"fieldConfig": {
"defaults": {
"unit": unit,
@ -675,7 +683,13 @@ def build_overview():
"title": "Control Plane Workloads",
"expr": CONTROL_WORKLOADS_EXPR,
"kind": "stat",
"thresholds": count_thresholds,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": 1},
],
},
"links": link_to("atlas-pods"),
},
{
@ -683,7 +697,13 @@ def build_overview():
"title": "Stuck Terminating",
"expr": STUCK_TERMINATING_EXPR,
"kind": "stat",
"thresholds": count_thresholds,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": 1},
],
},
"links": link_to("atlas-pods"),
},
{
@ -701,7 +721,13 @@ def build_overview():
"title": "Problem Pods",
"expr": PROBLEM_PODS_EXPR,
"kind": "stat",
"thresholds": count_thresholds,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": 1},
],
},
"links": link_to("atlas-pods"),
},
{
@ -709,7 +735,13 @@ def build_overview():
"title": "CrashLoop / ImagePull",
"expr": CRASHLOOP_EXPR,
"kind": "stat",
"thresholds": count_thresholds,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": 1},
],
},
"links": link_to("atlas-pods"),
},
{
@ -894,7 +926,7 @@ def build_overview():
panels.append(
pie_panel(
28,
"Pods by Node",
"Node Pod Share",
'sum(kube_pod_info{pod!="" , node!=""}) by (node)',
{"h": 10, "w": 12, "x": 0, "y": 54},
)
@ -917,6 +949,7 @@ def build_overview():
{"color": "red", "value": 100},
],
},
instant=True,
)
)
@ -1102,7 +1135,7 @@ def build_pods_dashboard():
panels.append(
pie_panel(
8,
"Pods by Node",
"Node Pod Share",
'sum(kube_pod_info{pod!="" , node!=""}) by (node)',
{"h": 8, "w": 12, "x": 12, "y": 34},
)
@ -1125,6 +1158,7 @@ def build_pods_dashboard():
{"color": "red", "value": 100},
],
},
instant=True,
)
)
return {

View File

@ -41,6 +41,7 @@
},
"pieType": "pie",
"displayLabels": [
"name",
"percent"
],
"tooltip": {

View File

@ -93,17 +93,9 @@
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
"value": 1
}
]
},
@ -151,7 +143,7 @@
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A"
}
],
@ -168,17 +160,9 @@
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
"value": 1
}
]
},
@ -295,7 +279,7 @@
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A"
}
],
@ -312,17 +296,9 @@
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
"value": 1
}
]
},
@ -370,7 +346,7 @@
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
@ -387,17 +363,9 @@
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
"value": 1
}
]
},
@ -1087,6 +1055,7 @@
},
"pieType": "pie",
"displayLabels": [
"name",
"percent"
],
"tooltip": {
@ -1140,6 +1109,7 @@
},
"pieType": "pie",
"displayLabels": [
"name",
"percent"
],
"tooltip": {
@ -1193,6 +1163,7 @@
},
"pieType": "pie",
"displayLabels": [
"name",
"percent"
],
"tooltip": {
@ -1380,7 +1351,7 @@
{
"id": 28,
"type": "piechart",
"title": "Pods by Node",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1414,6 +1385,7 @@
},
"pieType": "pie",
"displayLabels": [
"name",
"percent"
],
"tooltip": {
@ -1448,7 +1420,8 @@
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A",
"legendFormat": "{{node}}"
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {

View File

@ -20,7 +20,7 @@
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A"
}
],
@ -80,7 +80,7 @@
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
@ -140,7 +140,7 @@
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A"
}
],
@ -363,7 +363,7 @@
{
"id": 8,
"type": "piechart",
"title": "Pods by Node",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -397,6 +397,7 @@
},
"pieType": "pie",
"displayLabels": [
"name",
"percent"
],
"tooltip": {
@ -431,7 +432,8 @@
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A",
"legendFormat": "{{node}}"
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {

View File

@ -50,6 +50,7 @@ data:
},
"pieType": "pie",
"displayLabels": [
"name",
"percent"
],
"tooltip": {

View File

@ -102,17 +102,9 @@ data:
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
"value": 1
}
]
},
@ -160,7 +152,7 @@ data:
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A"
}
],
@ -177,17 +169,9 @@ data:
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
"value": 1
}
]
},
@ -304,7 +288,7 @@ data:
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A"
}
],
@ -321,17 +305,9 @@ data:
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
"value": 1
}
]
},
@ -379,7 +355,7 @@ data:
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
@ -396,17 +372,9 @@ data:
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 3
"value": 1
}
]
},
@ -1096,6 +1064,7 @@ data:
},
"pieType": "pie",
"displayLabels": [
"name",
"percent"
],
"tooltip": {
@ -1149,6 +1118,7 @@ data:
},
"pieType": "pie",
"displayLabels": [
"name",
"percent"
],
"tooltip": {
@ -1202,6 +1172,7 @@ data:
},
"pieType": "pie",
"displayLabels": [
"name",
"percent"
],
"tooltip": {
@ -1389,7 +1360,7 @@ data:
{
"id": 28,
"type": "piechart",
"title": "Pods by Node",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1423,6 +1394,7 @@ data:
},
"pieType": "pie",
"displayLabels": [
"name",
"percent"
],
"tooltip": {
@ -1457,7 +1429,8 @@ data:
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A",
"legendFormat": "{{node}}"
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {

View File

@ -29,7 +29,7 @@ data:
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A"
}
],
@ -89,7 +89,7 @@ data:
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))",
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A"
}
],
@ -149,7 +149,7 @@ data:
},
"targets": [
{
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))",
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A"
}
],
@ -372,7 +372,7 @@ data:
{
"id": 8,
"type": "piechart",
"title": "Pods by Node",
"title": "Node Pod Share",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -406,6 +406,7 @@ data:
},
"pieType": "pie",
"displayLabels": [
"name",
"percent"
],
"tooltip": {
@ -440,7 +441,8 @@ data:
{
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A",
"legendFormat": "{{node}}"
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {