From e87d54f19db646f150ba30e833988db47b7e7d92 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 13 Dec 2025 15:51:45 -0300 Subject: [PATCH] atlas pods: per-namespace top node via topk --- scripts/dashboards_render_atlas.py | 25 +++++++++---------- .../monitoring/dashboards/atlas-pods.json | 2 +- .../monitoring/grafana-dashboard-pods.yaml | 2 +- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 3c69900..d058522 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -219,6 +219,12 @@ CONTROL_READY_FRACTION_EXPR = ( UPTIME_AVAIL_EXPR = ( f"min(({CONTROL_READY_FRACTION_EXPR}), ({TRAEFIK_READY_EXPR}))" ) + +# Tie-breaker to deterministically pick one node per namespace when shares tie. NOTE(review): not referenced by any expression in this patch (the topk query does not include it) - confirm it gets wired in. +NODE_TIEBREAKER = " + ".join( + f"({node_filter(node)}) * 1e-6 * {idx}" + for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1) +) UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:5m])" UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))" @@ -1173,23 +1179,15 @@ def build_pods_dashboard(): instant=True, ) ) + panels.append( table_panel( 10, "Namespace Plurality by Node", - ( - "max by (namespace,node) (" - " {share}" - " * on(namespace) group_left(node) (" - " {share} == bool on(namespace) group_left() (max by (namespace) ({share}))" - " )" - ")" - ).format( - share=( - "(sum by (namespace,node) (kube_pod_info{pod!=\"\"}) " - "/ on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100)" - ) - ), + 'topk by (namespace) (1, ' + '(sum by (namespace,node) (kube_pod_info{pod!=""}) ' + '/ on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=""}), 1) * 100)' + ')', {"h": 8, "w": 24, "x": 0, "y": 42}, unit="percent", transformations=[ @@ -1199,6 +1197,7 @@ def build_pods_dashboard(): instant=True, ) ) + return { 
"uid": "atlas-pods", "title": "Atlas Pods", diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index b5cfdee..ac4e771 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -508,7 +508,7 @@ }, "targets": [ { - "expr": "max by (namespace,node) ( (sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace) group_left(node) ( (sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100))) ))", + "expr": "topk by (namespace) (1, (sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100))", "refId": "A", "instant": true } diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index 798cf7e..392a03a 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -517,7 +517,7 @@ data: }, "targets": [ { - "expr": "max by (namespace,node) ( (sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace) group_left(node) ( (sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100))) ))", + 
"expr": "topk by (namespace) (1, (sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100))", "refId": "A", "instant": true }