From 68d4f4390314a5832c4b8102c9a12b0b47133954 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 13 Dec 2025 15:11:21 -0300 Subject: [PATCH] atlas pods: stabilize plurality query to avoid 422 --- scripts/dashboards_render_atlas.py | 9 ++---- services/monitoring/README.md | 28 ------------------- .../monitoring/dashboards/atlas-pods.json | 2 +- .../monitoring/grafana-dashboard-pods.yaml | 2 +- 4 files changed, 5 insertions(+), 36 deletions(-) delete mode 100644 services/monitoring/README.md diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index ba11062..7692bdb 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -1178,13 +1178,10 @@ def build_pods_dashboard(): 10, "Namespace Plurality by Node", ( - "(" + "max by (namespace,node) (" " {share}" - ")" - "* on(namespace) group_left(node) (" - " ({share})" - " == bool on(namespace) group_left() (" - " max by (namespace) ({share})" + " * on(namespace) group_left(node) (" + " {share} == bool on(namespace) group_left() (max by (namespace) ({share}))" " )" ")" ).format( diff --git a/services/monitoring/README.md b/services/monitoring/README.md deleted file mode 100644 index 835ae1d..0000000 --- a/services/monitoring/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# services/monitoring - -## Grafana admin secret - -The Grafana Helm release expects a pre-existing secret named `grafana-admin` -in the `monitoring` namespace. Create or rotate it with: - -```bash -kubectl create secret generic grafana-admin \ - --namespace monitoring \ - --from-literal=admin-user=admin \ - --from-literal=admin-password='REPLACE_ME' -``` - -Update the password whenever you rotate credentials. - -## DCGM exporter image - -The NVIDIA GPU metrics DaemonSet expects `registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`, mirrored from `docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`. Refresh it in Zot when bumping versions: - -```bash -skopeo copy \ - --all \ - docker://docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 \ - docker://registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 -``` - -When finished mirroring from the control-plane, you can remove temporary tooling with `sudo apt-get purge -y skopeo && sudo apt-get autoremove -y` and clear `~/.config/containers/auth.json`. diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index 42d6c34..36c8767 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -508,7 +508,7 @@ }, "targets": [ { - "expr": "( (sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100))* on(namespace) group_left(node) ( ((sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100)) == bool on(namespace) group_left() ( max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100)) ))", + "expr": "max by (namespace,node) ( (sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace) group_left(node) ( (sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100))) ))", "refId": "A", "instant": true } diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index 51041be..e0a6bfe 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -517,7 +517,7 @@ data: }, "targets": [ { - "expr": "( (sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100))* on(namespace) group_left(node) ( ((sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100)) == bool on(namespace) group_left() ( max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100)) ))", + "expr": "max by (namespace,node) ( (sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) * on(namespace) group_left(node) ( (sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100) == bool on(namespace) group_left() (max by (namespace) ((sum by (namespace,node) (kube_pod_info{pod!=\"\"}) / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1) * 100))) ))", "refId": "A", "instant": true }