atlas dashboards: show pod counts (not %) and make stats zero-friendly

This commit is contained in:
Brad Stein 2025-12-12 20:30:00 -03:00
parent b770575b42
commit 056b7b7770
7 changed files with 89 additions and 103 deletions

View File

@ -187,16 +187,21 @@ def namespace_gpu_share_expr():
return namespace_share_expr(NAMESPACE_GPU_RAW) return namespace_share_expr(NAMESPACE_GPU_RAW)
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' PROBLEM_PODS_EXPR = (
'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"})) '
"or on() vector(0)"
)
CRASHLOOP_EXPR = ( CRASHLOOP_EXPR = (
'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason' 'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
'{reason=~"CrashLoopBackOff|ImagePullBackOff"})) ' '{reason=~"CrashLoopBackOff|ImagePullBackOff"})) '
"or on() vector(0)"
) )
STUCK_TERMINATING_EXPR = ( STUCK_TERMINATING_EXPR = (
'sum(max by (namespace,pod) (' 'sum(max by (namespace,pod) ('
'((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)' '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)'
' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)' ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
')) ' ')) '
"or on() vector(0)"
) )
UPTIME_WINDOW = "30d" UPTIME_WINDOW = "30d"
TRAEFIK_READY_EXPR = ( TRAEFIK_READY_EXPR = (
@ -549,7 +554,7 @@ def pie_panel(panel_id, title, expr, grid):
"options": { "options": {
"legend": {"displayMode": "list", "placement": "right"}, "legend": {"displayMode": "list", "placement": "right"},
"pieType": "pie", "pieType": "pie",
"displayLabels": ["percent"], "displayLabels": ["name", "percent"],
"tooltip": {"mode": "single"}, "tooltip": {"mode": "single"},
"colorScheme": "interpolateSpectral", "colorScheme": "interpolateSpectral",
"colorBy": "value", "colorBy": "value",
@ -569,6 +574,7 @@ def bargauge_panel(
limit=None, limit=None,
thresholds=None, thresholds=None,
decimals=None, decimals=None,
instant=False,
): ):
"""Return a bar gauge panel with label-aware reduction.""" """Return a bar gauge panel with label-aware reduction."""
panel = { panel = {
@ -577,7 +583,9 @@ def bargauge_panel(
"title": title, "title": title,
"datasource": PROM_DS, "datasource": PROM_DS,
"gridPos": grid, "gridPos": grid,
"targets": [{"expr": expr, "refId": "A", "legendFormat": "{{node}}"}], "targets": [
{"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})}
],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"unit": unit, "unit": unit,
@ -675,7 +683,13 @@ def build_overview():
"title": "Control Plane Workloads", "title": "Control Plane Workloads",
"expr": CONTROL_WORKLOADS_EXPR, "expr": CONTROL_WORKLOADS_EXPR,
"kind": "stat", "kind": "stat",
"thresholds": count_thresholds, "thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": 1},
],
},
"links": link_to("atlas-pods"), "links": link_to("atlas-pods"),
}, },
{ {
@ -683,7 +697,13 @@ def build_overview():
"title": "Stuck Terminating", "title": "Stuck Terminating",
"expr": STUCK_TERMINATING_EXPR, "expr": STUCK_TERMINATING_EXPR,
"kind": "stat", "kind": "stat",
"thresholds": count_thresholds, "thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": 1},
],
},
"links": link_to("atlas-pods"), "links": link_to("atlas-pods"),
}, },
{ {
@ -701,7 +721,13 @@ def build_overview():
"title": "Problem Pods", "title": "Problem Pods",
"expr": PROBLEM_PODS_EXPR, "expr": PROBLEM_PODS_EXPR,
"kind": "stat", "kind": "stat",
"thresholds": count_thresholds, "thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": 1},
],
},
"links": link_to("atlas-pods"), "links": link_to("atlas-pods"),
}, },
{ {
@ -709,7 +735,13 @@ def build_overview():
"title": "CrashLoop / ImagePull", "title": "CrashLoop / ImagePull",
"expr": CRASHLOOP_EXPR, "expr": CRASHLOOP_EXPR,
"kind": "stat", "kind": "stat",
"thresholds": count_thresholds, "thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": 1},
],
},
"links": link_to("atlas-pods"), "links": link_to("atlas-pods"),
}, },
{ {
@ -894,7 +926,7 @@ def build_overview():
panels.append( panels.append(
pie_panel( pie_panel(
28, 28,
"Pods by Node", "Node Pod Share",
'sum(kube_pod_info{pod!="" , node!=""}) by (node)', 'sum(kube_pod_info{pod!="" , node!=""}) by (node)',
{"h": 10, "w": 12, "x": 0, "y": 54}, {"h": 10, "w": 12, "x": 0, "y": 54},
) )
@ -917,6 +949,7 @@ def build_overview():
{"color": "red", "value": 100}, {"color": "red", "value": 100},
], ],
}, },
instant=True,
) )
) )
@ -1102,7 +1135,7 @@ def build_pods_dashboard():
panels.append( panels.append(
pie_panel( pie_panel(
8, 8,
"Pods by Node", "Node Pod Share",
'sum(kube_pod_info{pod!="" , node!=""}) by (node)', 'sum(kube_pod_info{pod!="" , node!=""}) by (node)',
{"h": 8, "w": 12, "x": 12, "y": 34}, {"h": 8, "w": 12, "x": 12, "y": 34},
) )
@ -1125,6 +1158,7 @@ def build_pods_dashboard():
{"color": "red", "value": 100}, {"color": "red", "value": 100},
], ],
}, },
instant=True,
) )
) )
return { return {

View File

@ -41,6 +41,7 @@
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [
"name",
"percent" "percent"
], ],
"tooltip": { "tooltip": {

View File

@ -93,17 +93,9 @@
"color": "green", "color": "green",
"value": null "value": null
}, },
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{ {
"color": "red", "color": "red",
"value": 3 "value": 1
} }
] ]
}, },
@ -151,7 +143,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -168,17 +160,9 @@
"color": "green", "color": "green",
"value": null "value": null
}, },
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{ {
"color": "red", "color": "red",
"value": 3 "value": 1
} }
] ]
}, },
@ -295,7 +279,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -312,17 +296,9 @@
"color": "green", "color": "green",
"value": null "value": null
}, },
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{ {
"color": "red", "color": "red",
"value": 3 "value": 1
} }
] ]
}, },
@ -370,7 +346,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))", "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -387,17 +363,9 @@
"color": "green", "color": "green",
"value": null "value": null
}, },
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{ {
"color": "red", "color": "red",
"value": 3 "value": 1
} }
] ]
}, },
@ -1087,6 +1055,7 @@
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [
"name",
"percent" "percent"
], ],
"tooltip": { "tooltip": {
@ -1140,6 +1109,7 @@
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [
"name",
"percent" "percent"
], ],
"tooltip": { "tooltip": {
@ -1193,6 +1163,7 @@
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [
"name",
"percent" "percent"
], ],
"tooltip": { "tooltip": {
@ -1380,7 +1351,7 @@
{ {
"id": 28, "id": 28,
"type": "piechart", "type": "piechart",
"title": "Pods by Node", "title": "Node Pod Share",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1414,6 +1385,7 @@
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [
"name",
"percent" "percent"
], ],
"tooltip": { "tooltip": {
@ -1448,7 +1420,8 @@
{ {
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {

View File

@ -20,7 +20,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -80,7 +80,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))", "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -140,7 +140,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -363,7 +363,7 @@
{ {
"id": 8, "id": 8,
"type": "piechart", "type": "piechart",
"title": "Pods by Node", "title": "Node Pod Share",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -397,6 +397,7 @@
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [
"name",
"percent" "percent"
], ],
"tooltip": { "tooltip": {
@ -431,7 +432,8 @@
{ {
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {

View File

@ -50,6 +50,7 @@ data:
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [
"name",
"percent" "percent"
], ],
"tooltip": { "tooltip": {

View File

@ -102,17 +102,9 @@ data:
"color": "green", "color": "green",
"value": null "value": null
}, },
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{ {
"color": "red", "color": "red",
"value": 3 "value": 1
} }
] ]
}, },
@ -160,7 +152,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -177,17 +169,9 @@ data:
"color": "green", "color": "green",
"value": null "value": null
}, },
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{ {
"color": "red", "color": "red",
"value": 3 "value": 1
} }
] ]
}, },
@ -304,7 +288,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -321,17 +305,9 @@ data:
"color": "green", "color": "green",
"value": null "value": null
}, },
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{ {
"color": "red", "color": "red",
"value": 3 "value": 1
} }
] ]
}, },
@ -379,7 +355,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))", "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -396,17 +372,9 @@ data:
"color": "green", "color": "green",
"value": null "value": null
}, },
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{ {
"color": "red", "color": "red",
"value": 3 "value": 1
} }
] ]
}, },
@ -1096,6 +1064,7 @@ data:
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [
"name",
"percent" "percent"
], ],
"tooltip": { "tooltip": {
@ -1149,6 +1118,7 @@ data:
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [
"name",
"percent" "percent"
], ],
"tooltip": { "tooltip": {
@ -1202,6 +1172,7 @@ data:
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [
"name",
"percent" "percent"
], ],
"tooltip": { "tooltip": {
@ -1389,7 +1360,7 @@ data:
{ {
"id": 28, "id": 28,
"type": "piechart", "type": "piechart",
"title": "Pods by Node", "title": "Node Pod Share",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1423,6 +1394,7 @@ data:
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [
"name",
"percent" "percent"
], ],
"tooltip": { "tooltip": {
@ -1457,7 +1429,8 @@ data:
{ {
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {

View File

@ -29,7 +29,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -89,7 +89,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))", "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -149,7 +149,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -372,7 +372,7 @@ data:
{ {
"id": 8, "id": 8,
"type": "piechart", "type": "piechart",
"title": "Pods by Node", "title": "Node Pod Share",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -406,6 +406,7 @@ data:
}, },
"pieType": "pie", "pieType": "pie",
"displayLabels": [ "displayLabels": [
"name",
"percent" "percent"
], ],
"tooltip": { "tooltip": {
@ -440,7 +441,8 @@ data:
{ {
"expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {