From 056b7b777089ee07412224fb7b746cb7aa3b5a19 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 12 Dec 2025 20:30:00 -0300 Subject: [PATCH] atlas dashboards: show pod counts (not %) and make zero-friendly stats --- scripts/dashboards_render_atlas.py | 56 +++++++++++++++---- services/monitoring/dashboards/atlas-gpu.json | 1 + .../monitoring/dashboards/atlas-overview.json | 55 +++++------------- .../monitoring/dashboards/atlas-pods.json | 12 ++-- .../monitoring/grafana-dashboard-gpu.yaml | 1 + .../grafana-dashboard-overview.yaml | 55 +++++------------- .../monitoring/grafana-dashboard-pods.yaml | 12 ++-- 7 files changed, 89 insertions(+), 103 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 1841224..f53302b 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -187,16 +187,21 @@ def namespace_gpu_share_expr(): return namespace_share_expr(NAMESPACE_GPU_RAW) -PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' +PROBLEM_PODS_EXPR = ( + 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"})) ' + "or on() vector(0)" +) CRASHLOOP_EXPR = ( 'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason' - '{reason=~"CrashLoopBackOff|ImagePullBackOff"}))' + '{reason=~"CrashLoopBackOff|ImagePullBackOff"})) ' + "or on() vector(0)" ) STUCK_TERMINATING_EXPR = ( 'sum(max by (namespace,pod) (' '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)' ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)' - '))' + ')) ' + "or on() vector(0)" ) UPTIME_WINDOW = "30d" TRAEFIK_READY_EXPR = ( @@ -549,7 +554,7 @@ def pie_panel(panel_id, title, expr, grid): "options": { "legend": {"displayMode": "list", "placement": "right"}, "pieType": "pie", - "displayLabels": ["percent"], + "displayLabels": ["name", "percent"], "tooltip": {"mode": "single"}, "colorScheme": "interpolateSpectral", "colorBy": "value", @@ -569,6 +574,7 @@ def bargauge_panel( limit=None, thresholds=None, decimals=None, + instant=False, ): """Return a bar gauge panel with label-aware reduction.""" panel = { @@ -577,7 +583,9 @@ def bargauge_panel( "title": title, "datasource": PROM_DS, "gridPos": grid, - "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{node}}"}], + "targets": [ + {"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})} + ], "fieldConfig": { "defaults": { "unit": unit, @@ -675,7 +683,13 @@ def build_overview(): "title": "Control Plane Workloads", "expr": CONTROL_WORKLOADS_EXPR, "kind": "stat", - "thresholds": count_thresholds, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": 1}, + ], + }, "links": link_to("atlas-pods"), }, { @@ -683,7 +697,13 @@ def build_overview(): "title": "Stuck Terminating", "expr": STUCK_TERMINATING_EXPR, "kind": "stat", - "thresholds": count_thresholds, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": 1}, + ], + }, "links": link_to("atlas-pods"), }, { @@ -701,7 +721,13 @@ def build_overview(): "title": "Problem Pods", "expr": PROBLEM_PODS_EXPR, "kind": "stat", - "thresholds": count_thresholds, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": 1}, + ], + }, "links": link_to("atlas-pods"), }, { @@ -709,7 +735,13 @@ def build_overview(): "title": "CrashLoop / ImagePull", "expr": CRASHLOOP_EXPR, "kind": "stat", - "thresholds": count_thresholds, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": 1}, + ], + }, "links": link_to("atlas-pods"), }, { @@ -894,7 +926,7 @@ def build_overview(): panels.append( pie_panel( 28, - "Pods by Node", + "Node Pod Share", 'sum(kube_pod_info{pod!="" , node!=""}) by (node)', {"h": 10, "w": 12, "x": 0, "y": 54}, ) @@ -917,6 +949,7 @@ def build_overview(): {"color": "red", "value": 100}, ], }, + instant=True, ) ) @@ -1102,7 +1135,7 @@ def build_pods_dashboard(): panels.append( pie_panel( 8, - "Pods by Node", + "Node Pod Share", 'sum(kube_pod_info{pod!="" , node!=""}) by (node)', {"h": 8, "w": 12, "x": 12, "y": 34}, ) @@ -1125,6 +1158,7 @@ def build_pods_dashboard(): {"color": "red", "value": 100}, ], }, + instant=True, ) ) return { diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 9071b0a..558aa63 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -41,6 +41,7 @@ }, "pieType": "pie", "displayLabels": [ + "name", "percent" ], "tooltip": { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index b4c37af..d269e6d 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -93,17 +93,9 @@ "color": "green", "value": null }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, { "color": "red", - "value": 3 + "value": 1 } ] }, @@ -151,7 +143,7 @@ }, "targets": [ { - "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)", "refId": "A" } ], @@ -168,17 +160,9 @@ "color": "green", "value": null }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, { "color": "red", - "value": 3 + "value": 1 } ] }, @@ -295,7 +279,7 @@ }, "targets": [ { - "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", + "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)", "refId": "A" } ], @@ -312,17 +296,9 @@ "color": "green", "value": null }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, { "color": "red", - "value": 3 + "value": 1 } ] }, @@ -370,7 +346,7 @@ }, "targets": [ { - "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))", + "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)", "refId": "A" } ], @@ -387,17 +363,9 @@ "color": "green", "value": null }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, { "color": "red", - "value": 3 + "value": 1 } ] }, @@ -1087,6 +1055,7 @@ }, "pieType": "pie", "displayLabels": [ + "name", "percent" ], "tooltip": { @@ -1140,6 +1109,7 @@ }, "pieType": "pie", "displayLabels": [ + "name", "percent" ], "tooltip": { @@ -1193,6 +1163,7 @@ }, "pieType": "pie", "displayLabels": [ + "name", "percent" ], "tooltip": { @@ -1380,7 +1351,7 @@ { "id": 28, "type": "piechart", - "title": "Pods by Node", + "title": "Node Pod Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1414,6 +1385,7 @@ }, "pieType": "pie", "displayLabels": [ + "name", "percent" ], "tooltip": { @@ -1448,7 +1420,8 @@ { "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index 0e497de..0b139e9 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", + "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)", "refId": "A" } ], @@ -80,7 +80,7 @@ }, "targets": [ { - "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))", + "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)", "refId": "A" } ], @@ -140,7 +140,7 @@ }, "targets": [ { - "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)", "refId": "A" } ], @@ -363,7 +363,7 @@ { "id": 8, "type": "piechart", - "title": "Pods by Node", + "title": "Node Pod Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -397,6 +397,7 @@ }, "pieType": "pie", "displayLabels": [ + "name", "percent" ], "tooltip": { @@ -431,7 +432,8 @@ { "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index b5c2c18..fee58ed 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -50,6 +50,7 @@ data: }, "pieType": "pie", "displayLabels": [ + "name", "percent" ], "tooltip": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index bec60ae..344ff5f 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -102,17 +102,9 @@ data: "color": "green", "value": null }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, { "color": "red", - "value": 3 + "value": 1 } ] }, @@ -160,7 +152,7 @@ data: }, "targets": [ { - "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)", "refId": "A" } ], @@ -177,17 +169,9 @@ data: "color": "green", "value": null }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, { "color": "red", - "value": 3 + "value": 1 } ] }, @@ -304,7 +288,7 @@ data: }, "targets": [ { - "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", + "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)", "refId": "A" } ], @@ -321,17 +305,9 @@ data: "color": "green", "value": null }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, { "color": "red", - "value": 3 + "value": 1 } ] }, @@ -379,7 +355,7 @@ data: }, "targets": [ { - "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))", + "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)", "refId": "A" } ], @@ -396,17 +372,9 @@ data: "color": "green", "value": null }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 2 - }, { "color": "red", - "value": 3 + "value": 1 } ] }, @@ -1096,6 +1064,7 @@ data: }, "pieType": "pie", "displayLabels": [ + "name", "percent" ], "tooltip": { @@ -1149,6 +1118,7 @@ data: }, "pieType": "pie", "displayLabels": [ + "name", "percent" ], "tooltip": { @@ -1202,6 +1172,7 @@ data: }, "pieType": "pie", "displayLabels": [ + "name", "percent" ], "tooltip": { @@ -1389,7 +1360,7 @@ data: { "id": 28, "type": "piechart", - "title": "Pods by Node", + "title": "Node Pod Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1423,6 +1394,7 @@ data: }, "pieType": "pie", "displayLabels": [ + "name", "percent" ], "tooltip": { @@ -1457,7 +1429,8 @@ data: { "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index 3eb80b2..216ef09 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", + "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})) or on() vector(0)", "refId": "A" } ], @@ -89,7 +89,7 @@ data: }, "targets": [ { - "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))", + "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})) or on() vector(0)", "refId": "A" } ], @@ -149,7 +149,7 @@ data: }, "targets": [ { - "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0))) or on() vector(0)", "refId": "A" } ], @@ -372,7 +372,7 @@ data: { "id": 8, "type": "piechart", - "title": "Pods by Node", + "title": "Node Pod Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -406,6 +406,7 @@ data: }, "pieType": "pie", "displayLabels": [ + "name", "percent" ], "tooltip": { @@ -440,7 +441,8 @@ data: { "expr": "topk(12, sum(kube_pod_info{pod!=\"\" , node!=\"\"}) by (node))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": {