From 06337f2b9d4c05f923169e9e232700a6baf0c35c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 14 Nov 2025 00:02:59 -0300 Subject: [PATCH 01/71] monitoring: add grafana and alertmanager --- clusters/atlas/flux-system/gotk-sync.yaml | 2 +- services/monitoring/README.md | 15 ++ .../monitoring/grafana-dashboard-public.yaml | 227 ++++++++++++++++++ .../monitoring/grafana-dashboard-sre.yaml | 223 +++++++++++++++++ services/monitoring/helmrelease.yaml | 131 ++++++++++ services/monitoring/kustomization.yaml | 2 + 6 files changed, 599 insertions(+), 1 deletion(-) create mode 100644 services/monitoring/README.md create mode 100644 services/monitoring/grafana-dashboard-public.yaml create mode 100644 services/monitoring/grafana-dashboard-sre.yaml diff --git a/clusters/atlas/flux-system/gotk-sync.yaml b/clusters/atlas/flux-system/gotk-sync.yaml index 473ab99..46f65d3 100644 --- a/clusters/atlas/flux-system/gotk-sync.yaml +++ b/clusters/atlas/flux-system/gotk-sync.yaml @@ -8,7 +8,7 @@ metadata: spec: interval: 1m0s ref: - branch: main + branch: feature/atlas-monitoring secretRef: name: flux-system-gitea url: ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git diff --git a/services/monitoring/README.md b/services/monitoring/README.md new file mode 100644 index 0000000..74baf08 --- /dev/null +++ b/services/monitoring/README.md @@ -0,0 +1,15 @@ +# services/monitoring + +## Grafana admin secret + +The Grafana Helm release expects a pre-existing secret named `grafana-admin` +in the `monitoring` namespace. Create or rotate it with: + +```bash +kubectl create secret generic grafana-admin \ + --namespace monitoring \ + --from-literal=admin-user=admin \ + --from-literal=admin-password='REPLACE_ME' +``` + +Update the password whenever you rotate credentials. 
diff --git a/services/monitoring/grafana-dashboard-public.yaml b/services/monitoring/grafana-dashboard-public.yaml new file mode 100644 index 0000000..db5d6c1 --- /dev/null +++ b/services/monitoring/grafana-dashboard-public.yaml @@ -0,0 +1,227 @@ +# services/monitoring/grafana-dashboard-public.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-public + labels: + grafana_dashboard: "1" +data: + atlas-public-overview.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": false, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "editorMode": "code", + "expr": "sum(kube_pod_status_phase{phase=\"Running\"})", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Running pods", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "description": "Aggregated CPU usage across all schedulable nodes.", + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"continuous-BlYlRd" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 60 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "expr": "avg(100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Average node CPU", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace!=\"\", container!=\"\"}[5m])) by (namespace)", + "legendFormat": "{{namespace}}", + "refId": "A" + } + ], + "title": "Namespace CPU (5m avg)", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "public" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Atlas Public Overview", + "uid": "atlas-public", + "version": 1, + "weekStart": "" + } diff --git a/services/monitoring/grafana-dashboard-sre.yaml b/services/monitoring/grafana-dashboard-sre.yaml new 
file mode 100644 index 0000000..12995af --- /dev/null +++ b/services/monitoring/grafana-dashboard-sre.yaml @@ -0,0 +1,223 @@ +# services/monitoring/grafana-dashboard-sre.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-sre + labels: + grafana_dashboard: "1" +data: + atlas-sre-overview.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "description": "Percentage of Ready nodes.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "expr": "avg(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) * 100", + "refId": "A" + } + ], + "title": "Ready nodes", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 11, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + },
+ "expr": "sum by (node)(node_filesystem_avail_bytes{mountpoint=\"/\"})", + "legendFormat": "{{node}}", + "refId": "A" + } + ], + "title": "Free root filesystem bytes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"crypto\",container!=\"\"}[5m])) by (pod)", + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Crypto namespace CPU usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 13, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": false + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "expr": "count(sum(kube_pod_status_phase{phase=\"Failed\"}) by (namespace) > 0) or vector(0)", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Namespaces with failed pods", + "type": "bargauge" + } + ], + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "sre" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timepicker": {}, + "title": "Atlas SRE Overview", + "uid": "atlas-sre", + "version": 1 + } diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 22bc2b1..3341e9d 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -210,3 +210,134 @@ spec: - action: keep source_labels: [__meta_kubernetes_namespace,
__meta_kubernetes_pod_label_app_kubernetes_io_part_of] regex: flux-system;flux + +--- + +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: grafana + namespace: monitoring +spec: + interval: 15m + chart: + spec: + chart: grafana + version: "~8.5.0" + sourceRef: + kind: HelmRepository + name: grafana + namespace: flux-system + values: + admin: + existingSecret: grafana-admin + userKey: admin-user + passwordKey: admin-password + persistence: + enabled: true + size: 20Gi + storageClassName: astreae + service: + type: ClusterIP + env: + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "true" + - name: GF_AUTH_ANONYMOUS_ORG_ROLE + value: Viewer + - name: GF_SECURITY_ALLOW_EMBEDDING + value: "true" + grafana.ini: + server: + domain: reporting.bstein.dev + root_url: https://reporting.bstein.dev/ + auth.anonymous: + hide_version: true + users: + default_theme: dark + ingress: + enabled: true + ingressClassName: traefik + annotations: + cert-manager.io/cluster-issuer: letsencrypt + hosts: + - reporting.bstein.dev + tls: + - secretName: grafana-reporting-tls + hosts: + - reporting.bstein.dev + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: VictoriaMetrics + type: prometheus + access: proxy + url: http://victoria-metrics-single-server:8428 + isDefault: true + jsonData: + timeInterval: "15s" + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: public + orgId: 1 + folder: Atlas Public + type: file + disableDeletion: false + allowUiUpdates: false + options: + path: /var/lib/grafana/dashboards/public + - name: sre + orgId: 1 + folder: Atlas SRE + type: file + disableDeletion: false + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards/sre + dashboardsConfigMaps: + - configMapName: grafana-dashboard-public + folder: public + - configMapName: grafana-dashboard-sre + folder: sre + +--- + +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: alertmanager + 
namespace: monitoring +spec: + interval: 15m + chart: + spec: + chart: alertmanager + version: "~1.9.0" + sourceRef: + kind: HelmRepository + name: prometheus + namespace: flux-system + values: + ingress: + enabled: true + ingressClassName: traefik + annotations: + cert-manager.io/cluster-issuer: letsencrypt + hosts: + - alerts.bstein.dev + tls: + - secretName: alerts-bstein-dev-tls + hosts: + - alerts.bstein.dev + config: + global: + resolve_timeout: 5m + route: + receiver: default + group_wait: 30s + group_interval: 5m + repeat_interval: 2h + receivers: + - name: default diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 036afa3..bb321b5 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -5,4 +5,6 @@ namespace: monitoring resources: - namespace.yaml - rbac.yaml + - grafana-dashboard-public.yaml + - grafana-dashboard-sre.yaml - helmrelease.yaml From c2cb9011024806df0449c3c102308ade6a6cb1b8 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 14 Nov 2025 08:29:59 -0300 Subject: [PATCH 02/71] monitoring: fix grafana values --- services/monitoring/helmrelease.yaml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 3341e9d..9cac705 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -260,7 +260,9 @@ spec: annotations: cert-manager.io/cluster-issuer: letsencrypt hosts: - - reporting.bstein.dev + - host: reporting.bstein.dev + paths: + - / tls: - secretName: grafana-reporting-tls hosts: @@ -297,10 +299,8 @@ spec: options: path: /var/lib/grafana/dashboards/sre dashboardsConfigMaps: - - configMapName: grafana-dashboard-public - folder: public - - configMapName: grafana-dashboard-sre - folder: sre + public: grafana-dashboard-public + sre: grafana-dashboard-sre --- @@ -326,7 +326,9 @@ spec: annotations: 
cert-manager.io/cluster-issuer: letsencrypt hosts: - - alerts.bstein.dev + - host: alerts.bstein.dev + paths: + - / tls: - secretName: alerts-bstein-dev-tls hosts: From 465103a57e30d63c2e94f276762b0d8972ec80e7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 14 Nov 2025 08:33:53 -0300 Subject: [PATCH 03/71] grafana: fix dashboard provider list --- services/monitoring/helmrelease.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 9cac705..e9b6154 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -287,7 +287,7 @@ spec: folder: Atlas Public type: file disableDeletion: false - allowUiUpdates: false + editable: false options: path: /var/lib/grafana/dashboards/public - name: sre @@ -295,7 +295,7 @@ spec: folder: Atlas SRE type: file disableDeletion: false - allowUiUpdates: true + editable: true options: path: /var/lib/grafana/dashboards/sre dashboardsConfigMaps: From 394fcf2ee4a7131f38add30029c432ef287c7c8e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 14 Nov 2025 08:37:46 -0300 Subject: [PATCH 04/71] grafana: use string host format --- services/monitoring/helmrelease.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index e9b6154..91cf0ce 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -260,9 +260,8 @@ spec: annotations: cert-manager.io/cluster-issuer: letsencrypt hosts: - - host: reporting.bstein.dev - paths: - - / + - reporting.bstein.dev + path: / tls: - secretName: grafana-reporting-tls hosts: From 418329e17337522bbc27fc7e1e71fb3d061f2278 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 14 Nov 2025 08:51:09 -0300 Subject: [PATCH 05/71] monitoring: fix ingress and env formats --- services/monitoring/helmrelease.yaml | 12 +++++------- 1 file changed, 5 
insertions(+), 7 deletions(-) diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 91cf0ce..b176c64 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -240,12 +240,9 @@ spec: service: type: ClusterIP env: - - name: GF_AUTH_ANONYMOUS_ENABLED - value: "true" - - name: GF_AUTH_ANONYMOUS_ORG_ROLE - value: Viewer - - name: GF_SECURITY_ALLOW_EMBEDDING - value: "true" + GF_AUTH_ANONYMOUS_ENABLED: "true" + GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer + GF_SECURITY_ALLOW_EMBEDDING: "true" grafana.ini: server: domain: reporting.bstein.dev @@ -327,7 +324,8 @@ spec: hosts: - host: alerts.bstein.dev paths: - - / + - path: / + pathType: Prefix tls: - secretName: alerts-bstein-dev-tls hosts: From 3cfe6393872ac21a6a30163480fb8cbb19518226 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 14 Nov 2025 19:13:40 -0300 Subject: [PATCH 06/71] monitoring: fix domain --- scripts/styx_prep_nvme_luks.sh | 575 +++++++++++++++++++++++++++ services/monitoring/helmrelease.yaml | 12 +- 2 files changed, 581 insertions(+), 6 deletions(-) create mode 100755 scripts/styx_prep_nvme_luks.sh diff --git a/scripts/styx_prep_nvme_luks.sh b/scripts/styx_prep_nvme_luks.sh new file mode 100755 index 0000000..d5ea0c5 --- /dev/null +++ b/scripts/styx_prep_nvme_luks.sh @@ -0,0 +1,575 @@ +#!/usr/bin/env bash +set -euo pipefail + +# --- CONFIG (edit if needed) --- +# Leave NVME empty → script will auto-detect the SSK dock. +NVME="${NVME:-}" +FLAVOR="${FLAVOR:-desktop}" +# Persistent cache so the image survives reboots. 
+IMG_DIR="${IMG_DIR:-/var/cache/styx-rpi}" +IMG_FILE="${IMG_FILE:-ubuntu-24.04.3-preinstalled-${FLAVOR}-arm64+raspi.img}" +IMG_BOOT_MNT="${IMG_BOOT_MNT:-/mnt/img-boot}" +IMG_ROOT_MNT="${IMG_ROOT_MNT:-/mnt/img-root}" +TGT_ROOT="/mnt/target-root" +TGT_BOOT="/mnt/target-boot" + +STYX_USER="styx" +STYX_HOSTNAME="titan-ag" +STYX_PASS="TempPass#123" # will be forced to change on first login via cloud-init +SSH_PUBKEY="ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOb8oMX6u0z3sH/p/WBGlvPXXdbGETCKzWYwR/dd6fZb titan-bastion" + +# Video / input prefs +DSI_FLAGS="video=DSI-1:800x480@60D video=HDMI-A-1:off video=HDMI-A-2:off" + +# --- Helpers --- +fatal(){ echo "ERROR: $*" >&2; exit 1; } +need(){ command -v "$1" >/dev/null || fatal "Missing tool: $1"; } + +require_root(){ [[ $EUID -eq 0 ]] || exec sudo -E "$0" "$@"; } + +part() { + local n="$1" + if [[ "$NVME" =~ [0-9]$ ]]; then + echo "${NVME}p${n}" + else + echo "${NVME}${n}" + fi +} + +auto_detect_target_disk() { + # If user already set NVME, validate and return + if [[ -n "${NVME:-}" ]]; then + [[ -b "$NVME" ]] || fatal "NVME='$NVME' is not a block device" + return + fi + + # Prefer stable by-id symlinks + local byid + byid=$(ls -1 /dev/disk/by-id/usb-SSK* 2>/dev/null | head -n1 || true) + if [[ -n "$byid" ]]; then + NVME=$(readlink -f "$byid") + else + # Heuristic via lsblk -S: look for USB with SSK/Ingram/Storage in vendor/model + NVME=$(lsblk -S -p -o NAME,TRAN,VENDOR,MODEL | \ + awk '/ usb / && ($3 ~ /SSK|Ingram/i || $4 ~ /SSK|Storage/i){print $1; exit}') + fi + + [[ -n "${NVME:-}" && -b "$NVME" ]] || fatal "Could not auto-detect SSK USB NVMe dock. Export NVME=/dev/sdX and re-run." 
+ echo "Auto-detected target disk: $NVME" +} + +preflight_cleanup() { + local img="$IMG_DIR/$IMG_FILE" + + # 1) Unmount image mountpoints and detach only loops for this IMG + umount -lf "$IMG_BOOT_MNT" "$IMG_ROOT_MNT" 2>/dev/null || true + # losetup -j exits non-zero if no association → tolerate it + { losetup -j "$img" | cut -d: -f1 | xargs -r losetup -d; } 2>/dev/null || true + + # 2) Unmount our target mounts + umount -lf "$TGT_ROOT/boot/firmware" "$TGT_BOOT" "$TGT_ROOT" 2>/dev/null || true + + # 3) Unmount the actual target partitions if mounted anywhere (tolerate 'not found') + for p in "$(part 1)" "$(part 2)"; do + # findmnt returns 1 when no match → capture and iterate if any + while read -r mnt; do + [ -n "$mnt" ] && umount -lf "$mnt" 2>/dev/null || true + done < <(findmnt -rno TARGET -S "$p" 2>/dev/null || true) + done + + # 4) Close dm-crypt mapping (if it exists) + cryptsetup luksClose cryptroot 2>/dev/null || true + dmsetup remove -f cryptroot 2>/dev/null || true + + # 5) Let udev settle + command -v udevadm >/dev/null && udevadm settle || true +} + +guard_target_device() { + # Refuse to operate if NVME appears to be the current system disk + local root_src root_disk + root_src=$(findmnt -no SOURCE /) + root_disk=$(lsblk -no pkname "$root_src" 2>/dev/null || true) + if [[ -n "$root_disk" && "/dev/$root_disk" == "$NVME" ]]; then + fatal "Refusing to operate on system disk ($NVME). Pick the external NVMe." + fi +} + +need_host_fido2() { + if ! command -v fido2-token >/dev/null 2>&1; then + echo "Host is missing fido2-token. On Arch: sudo pacman -S libfido2" + echo "On Debian/Ubuntu host: sudo apt-get install fido2-tools" + exit 1 + fi +} + +ensure_image() { + mkdir -p "$IMG_DIR" + chmod 755 "$IMG_DIR" + + local BASE="https://cdimage.ubuntu.com/releases/noble/release" + local XZ="ubuntu-24.04.3-preinstalled-${FLAVOR}-arm64+raspi.img.xz" + + # If the decompressed .img is missing, fetch/decompress into the cache. + if [[ ! 
-f "$IMG_DIR/$IMG_FILE" ]]; then + need curl; need unxz # Arch: pacman -S curl xz | Ubuntu: apt-get install curl xz-utils + if [[ ! -f "$IMG_DIR/$XZ" ]]; then + echo "Fetching image…" + curl -fL -o "$IMG_DIR/$XZ" "$BASE/$XZ" + fi + echo "Decompressing to $IMG_DIR/$IMG_FILE …" + # Keep the .xz for future runs; stream-decompress to the .img + if command -v unxz >/dev/null 2>&1; then + unxz -c "$IMG_DIR/$XZ" > "$IMG_DIR/$IMG_FILE" + else + need xz + xz -dc "$IMG_DIR/$XZ" > "$IMG_DIR/$IMG_FILE" + fi + sync + else + echo "Using cached image: $IMG_DIR/$IMG_FILE" + fi +} + +ensure_binfmt_aarch64(){ + # Register qemu-aarch64 for chrooted ARM64 apt runs + if [[ ! -e /proc/sys/fs/binfmt_misc/qemu-aarch64 ]]; then + need docker + systemctl enable --now docker >/dev/null 2>&1 || true + docker run --rm --privileged tonistiigi/binfmt --install arm64 >/dev/null + fi + if [[ ! -x /usr/local/bin/qemu-aarch64-static ]]; then + docker rm -f qemu-static >/dev/null 2>&1 || true + docker create --name qemu-static docker.io/multiarch/qemu-user-static:latest >/dev/null + docker cp qemu-static:/usr/bin/qemu-aarch64-static /usr/local/bin/ + chmod 755 /usr/local/bin/qemu-aarch64-static # force 0755 in place; 'install SRC SRC' fails with "are the same file" and aborts under set -e + docker rm qemu-static >/dev/null + fi +} + +open_image() { + [[ -r "$IMG_DIR/$IMG_FILE" ]] || fatal "Image not found: $IMG_DIR/$IMG_FILE" + mkdir -p "$IMG_BOOT_MNT" "$IMG_ROOT_MNT" + + # Pre-clean: detach any previous loop(s) for this image (tolerate absence) + umount -lf "$IMG_BOOT_MNT" 2>/dev/null || true + umount -lf "$IMG_ROOT_MNT" 2>/dev/null || true + # If no loop is attached, losetup -j returns non-zero → swallow it + mapfile -t OLD < <({ losetup -j "$IMG_DIR/$IMG_FILE" | cut -d: -f1; } 2>/dev/null || true) + for L in "${OLD[@]:-}"; do losetup -d "$L" 2>/dev/null || true; done + command -v udevadm >/dev/null && udevadm settle || true + + # Attach with partition scan; wait for partition nodes to exist + LOOP=$(losetup --find --show --partscan
"$IMG_DIR/$IMG_FILE") || fatal "losetup failed" + command -v udevadm >/dev/null && udevadm settle || true + for _ in {1..25}; do + [[ -b "${LOOP}p1" && -b "${LOOP}p2" ]] && break + sleep 0.1 + command -v udevadm >/dev/null && udevadm settle || true + done + [[ -b "${LOOP}p1" ]] || fatal "loop partitions not present for $LOOP" + + # Cleanup on exit: unmount first, then detach loop (tolerate absence) + trap 'umount -lf "'"$IMG_BOOT_MNT"'" "'"$IMG_ROOT_MNT"'" 2>/dev/null; losetup -d "'"$LOOP"'" 2>/dev/null' EXIT + + # Mount image partitions read-only + mount -o ro "${LOOP}p1" "$IMG_BOOT_MNT" + mount -o ro "${LOOP}p2" "$IMG_ROOT_MNT" + + # Sanity checks without using failing pipelines + # start*.elf must exist + if ! compgen -G "$IMG_BOOT_MNT/start*.elf" > /dev/null; then + fatal "start*.elf not found in image" + fi + # vmlinuz-* must exist + if ! compgen -G "$IMG_ROOT_MNT/boot/vmlinuz-*" > /dev/null; then + fatal "vmlinuz-* not found in image root" + fi +} + +confirm_and_wipe(){ + lsblk -o NAME,SIZE,MODEL,TRAN,LABEL "$NVME" + read -rp "Type EXACTLY 'WIPE' to destroy ALL DATA on $NVME: " ACK + [[ "$ACK" == "WIPE" ]] || fatal "Aborted" + wipefs -a "$NVME" + sgdisk -Zo "$NVME" + # GPT: 1: 1MiB..513MiB vfat ESP; 2: rest LUKS + parted -s "$NVME" mklabel gpt \ + mkpart system-boot fat32 1MiB 513MiB set 1 esp on \ + mkpart cryptroot 513MiB 100% + partprobe "$NVME"; sleep 1 + mkfs.vfat -F32 -n system-boot "$(part 1)" +} + +setup_luks(){ + echo "Create LUKS2 on $(part 2) (you will be prompted for a passphrase; keep it as fallback)" + need cryptsetup + cryptsetup luksFormat --type luks2 "$(part 2)" + cryptsetup open "$(part 2)" cryptroot + mkfs.ext4 -L rootfs /dev/mapper/cryptroot +} + +mount_targets(){ + mkdir -p "$TGT_ROOT" "$TGT_BOOT" + mount /dev/mapper/cryptroot "$TGT_ROOT" + mkdir -p "$TGT_ROOT/boot/firmware" + mount "$(part 1)" "$TGT_BOOT" + mount --bind "$TGT_BOOT" "$TGT_ROOT/boot/firmware" +} + +rsync_root_and_boot(){ + need rsync + rsync -aAXH --numeric-ids --delete \ 
+ --exclude='/boot/firmware' --exclude='/boot/firmware/**' \ + --exclude='/dev/*' --exclude='/proc/*' --exclude='/sys/*' \ + --exclude='/run/*' --exclude='/tmp/*' --exclude='/mnt/*' \ + --exclude='/media/*' --exclude='/lost+found' \ + "$IMG_ROOT_MNT"/ "$TGT_ROOT"/ + rsync -aH --delete "$IMG_BOOT_MNT"/ "$TGT_ROOT/boot/firmware"/ +} + +write_crypttab_fstab(){ + LUUID=$(blkid -s UUID -o value "$(part 2)") + printf 'cryptroot UUID=%s none luks,discard,fido2-device=auto\n' "$LUUID" > "$TGT_ROOT/etc/crypttab" + cat > "$TGT_ROOT/etc/fstab" <> "$C" + grep -q '^cmdline=cmdline.txt' "$C" || sed -i '1i cmdline=cmdline.txt' "$C" + + # Display & buses (Pi 5) + grep -q '^dtoverlay=vc4-kms-v3d-pi5' "$C" || echo 'dtoverlay=vc4-kms-v3d-pi5' >> "$C" + grep -q '^dtparam=i2c_arm=on' "$C" || echo 'dtparam=i2c_arm=on' >> "$C" + grep -q '^dtparam=pciex1=on' "$C" || echo 'dtparam=pciex1=on' >> "$C" + grep -q '^dtparam=pciex1_gen=2' "$C" || echo 'dtparam=pciex1_gen=2' >> "$C" + grep -q '^enable_uart=1' "$C" || echo 'enable_uart=1' >> "$C" + + # Minimal, correct dracut hints using the bare UUID + local LUUID; LUUID=$(blkid -s UUID -o value "$(part 2)") + : > "$CL" + { + echo -n "rd.luks.uuid=$LUUID rd.luks.name=$LUUID=cryptroot " + echo -n "root=/dev/mapper/cryptroot rootfstype=ext4 rootwait fixrtc " + echo "console=serial0,115200 console=tty1 ds=nocloud;s=file:///boot/firmware/ ${DSI_FLAGS} rd.debug" + } >> "$CL" +} + +seed_cloud_init(){ + # NoCloud seed to create user, lock down SSH, set hostname, and enable avahi. 
+ cat > "$TGT_ROOT/boot/firmware/user-data" < "$TGT_ROOT/boot/firmware/meta-data" +} + +prep_chroot_mounts(){ + for d in dev proc sys; do mount --bind "/$d" "$TGT_ROOT/$d"; done + mount -t devpts devpts "$TGT_ROOT/dev/pts" + # Replace the usual resolv.conf symlink with a real file for apt to work + rm -f "$TGT_ROOT/etc/resolv.conf" + cp /etc/resolv.conf "$TGT_ROOT/etc/resolv.conf" + + # Block service starts (no systemd in chroot) + cat > "$TGT_ROOT/usr/sbin/policy-rc.d" <<'EOP' +#!/bin/sh +exit 101 +EOP + chmod +x "$TGT_ROOT/usr/sbin/policy-rc.d" + + # Ensure qemu static is present inside chroot + install -D -m755 /usr/local/bin/qemu-aarch64-static "$TGT_ROOT/usr/bin/qemu-aarch64-static" +} + +in_chroot(){ + chroot "$TGT_ROOT" /usr/bin/qemu-aarch64-static /bin/bash -lc ' +set -euo pipefail +export DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC + +# --- APT sources (ports) --- +cat > /etc/apt/sources.list <<'"'"'EOS'"'"' +deb http://ports.ubuntu.com/ubuntu-ports noble main restricted universe multiverse +deb http://ports.ubuntu.com/ubuntu-ports noble-updates main restricted universe multiverse +deb http://ports.ubuntu.com/ubuntu-ports noble-security main restricted universe multiverse +EOS + +apt-get update + +# --- Remove snaps and pin them off --- +apt-get -y purge snapd || true +rm -rf /snap /var/snap /var/lib/snapd /home/*/snap || true +mkdir -p /etc/apt/preferences.d +cat > /etc/apt/preferences.d/nosnap.pref <<'"'"'EOS'"'"' +Package: snapd +Pin: release * +Pin-Priority: -10 +EOS + +# --- Base tools (no flash-kernel; we use dracut) --- +apt-get install -y --no-install-recommends \ + openssh-client openssh-server openssh-sftp-server avahi-daemon \ + cryptsetup dracut fido2-tools libfido2-1 i2c-tools \ + python3-smbus python3-pil zbar-tools qrencode lm-sensors \ + file zstd lz4 || true + +# Camera apps: try rpicam-apps; otherwise basic libcamera tools +apt-get install -y rpicam-apps || apt-get install -y libcamera-tools || true + +# --- Persistent journal so we can 
read logs after failed boot --- +mkdir -p /etc/systemd/journald.conf.d +cat > /etc/systemd/journald.conf.d/99-persistent.conf <<'"'"'EOS'"'"' +[Journal] +Storage=persistent +EOS + +# --- SSH hardening (ensure file exists even if package was half-installed) --- +if [ ! -f /etc/ssh/sshd_config ]; then + mkdir -p /etc/ssh + cat > /etc/ssh/sshd_config <<'"'"'EOS'"'"' +PermitRootLogin no +PasswordAuthentication no +KbdInteractiveAuthentication no +PubkeyAuthentication yes +# Accept defaults for the rest +EOS +fi +sed -i -e "s/^#\?PasswordAuthentication .*/PasswordAuthentication no/" \ + -e "s/^#\?KbdInteractiveAuthentication .*/KbdInteractiveAuthentication no/" \ + -e "s/^#\?PermitRootLogin .*/PermitRootLogin no/" \ + -e "s/^#\?PubkeyAuthentication .*/PubkeyAuthentication yes/" /etc/ssh/sshd_config || true + +# --- Hostname & hosts --- +echo "'"$STYX_HOSTNAME"'" > /etc/hostname +if grep -q "^127\\.0\\.1\\.1" /etc/hosts; then + sed -i "s/^127\\.0\\.1\\.1.*/127.0.1.1\t'"$STYX_HOSTNAME"'/" /etc/hosts +else + echo -e "127.0.1.1\t'"$STYX_HOSTNAME"'" >> /etc/hosts +fi + +# --- Enable services on first boot --- +mkdir -p /etc/systemd/system/multi-user.target.wants +ln -sf /lib/systemd/system/ssh.service /etc/systemd/system/multi-user.target.wants/ssh.service +ln -sf /lib/systemd/system/avahi-daemon.service /etc/systemd/system/multi-user.target.wants/avahi-daemon.service || true + +# --- Ensure i2c group --- +getent group i2c >/dev/null || groupadd i2c + +# --- Dracut configuration (generic, not host-only) --- +mkdir -p /etc/dracut.conf.d +cat > /etc/dracut.conf.d/00-hostonly.conf <<'"'"'EOS'"'"' +hostonly=no +EOS +cat > /etc/dracut.conf.d/10-systemd-crypt.conf <<'"'"'EOS'"'"' +add_dracutmodules+=" systemd crypt " +EOS +cat > /etc/dracut.conf.d/20-drivers.conf <<'"'"'EOS'"'"' +add_drivers+=" nvme xhci_pci xhci_hcd usbhid hid_generic hid " +EOS +cat > /etc/dracut.conf.d/30-fido2.conf <<'"'"'EOS'"'"' +install_items+="/usr/bin/systemd-cryptsetup /usr/bin/fido2-token 
/usr/lib/*/libfido2.so* /usr/lib/*/libcbor.so*" +EOS + +# --- Build initramfs and place it where firmware expects it --- +KVER=$(ls -1 /lib/modules | sort -V | tail -n1) +dracut --force /boot/initramfs-$KVER.img $KVER +ln -sf initramfs-$KVER.img /boot/initrd.img +ln -sf initramfs-$KVER.img /boot/initrd.img-$KVER +cp -a /boot/initramfs-$KVER.img /boot/firmware/initrd.img + +# --- Create uncompressed kernel for Pi 5 firmware --- +if [ -f "/usr/lib/linux-image-$KVER/Image" ]; then + cp -a "/usr/lib/linux-image-$KVER/Image" /boot/firmware/kernel_2712.img +else + FMT=$(file -b "/boot/vmlinuz-$KVER" || true) + case "$FMT" in + *Zstandard*|*zstd*) zstd -dc "/boot/vmlinuz-$KVER" > /boot/firmware/kernel_2712.img ;; + *LZ4*) lz4 -dc "/boot/vmlinuz-$KVER" > /boot/firmware/kernel_2712.img ;; + *gzip*) zcat "/boot/vmlinuz-$KVER" > /boot/firmware/kernel_2712.img ;; + *) cp -a "/boot/vmlinuz-$KVER" /boot/firmware/kernel_2712.img ;; + esac +fi + +# --- Ensure Pi 5 DTB is present on the boot partition --- +DTB=$(find /lib/firmware -type f -name "bcm2712-rpi-5-b.dtb" | sort | tail -n1 || true) +[ -n "$DTB" ] && cp -a "$DTB" /boot/firmware/ + +# --- Dracut hook to copy rdsosreport.txt to the FAT partition on failure --- +mkdir -p /usr/lib/dracut/modules.d/99copylog +cat > /usr/lib/dracut/modules.d/99copylog/module-setup.sh <<'"'"'EOS'"'"' +#!/bin/bash +check() { return 0; } +depends() { echo base; return 0; } +install() { + # Guard $moddir for nounset; derive if absent + local mdir="${moddir:-$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)}" + inst_hook emergency 99 "$mdir/copylog.sh" +} +EOS +chmod +x /usr/lib/dracut/modules.d/99copylog/module-setup.sh + +cat > /usr/lib/dracut/modules.d/99copylog/copylog.sh <<'"'"'EOS'"'"' +#!/bin/sh +set -e +for dev in /dev/nvme0n1p1 /dev/sda1 /dev/sdb1 /dev/mmcblk0p1; do + [ -b "$dev" ] || continue + mkdir -p /mnt/bootfat + if mount -t vfat "$dev" /mnt/bootfat 2>/dev/null; then + if [ -s /run/initramfs/rdsosreport.txt ]; then + cp -f 
/run/initramfs/rdsosreport.txt /mnt/bootfat/rdsosreport.txt 2>/dev/null || true + sync || true + fi + umount /mnt/bootfat || true + break + fi +done +EOS +chmod +x /usr/lib/dracut/modules.d/99copylog/copylog.sh + +# Rebuild to ensure the copylog module is included +dracut --force /boot/initramfs-$KVER.img $KVER +ln -sf initramfs-$KVER.img /boot/initrd.img +cp -a /boot/initramfs-$KVER.img /boot/firmware/initrd.img + +true +' +} + +verify_boot_assets(){ + echo "---- verify boot assets on FAT ----" + file "$TGT_ROOT/boot/firmware/kernel_2712.img" || true + ls -lh "$TGT_ROOT/boot/firmware/initrd.img" || true + echo "-- config.txt (key lines) --" + grep -E '^(kernel|initramfs|cmdline)=|^dtoverlay=|^dtparam=' "$TGT_ROOT/boot/firmware/config.txt" || true + echo "-- cmdline.txt --" + cat "$TGT_ROOT/boot/firmware/cmdline.txt" || true + echo "-- firmware blobs (sample) --" + ls -1 "$TGT_ROOT/boot/firmware"/start*.elf "$TGT_ROOT/boot/firmware"/fixup*.dat | head -n 8 || true + echo "-- Pi5 DTB --" + ls -l "$TGT_ROOT/boot/firmware/"*rpi-5-b.dtb || true +} + +enroll_fido_tokens(){ + echo "Enrolling FIDO2 Solo keys into $(part 2) ..." + need systemd-cryptenroll + need fido2-token + + # Collect all hidraw paths from both output styles (some distros print 'Device: /dev/hidrawX') + mapfile -t DEVS < <( + fido2-token -L \ + | sed -n 's,^\(/dev/hidraw[0-9]\+\):.*,\1,p; s,^Device:[[:space:]]\+/dev/hidraw\([0-9]\+\).*,/dev/hidraw\1,p' \ + | sort -u + ) + + if (( ${#DEVS[@]} == 0 )); then + echo "No FIDO2 tokens detected; skipping enrollment (you can enroll later)." + echo "Example later: systemd-cryptenroll $(part 2) --fido2-device=/dev/hidrawX --fido2-with-client-pin=no" + return 0 + fi + + # Recommend keeping exactly ONE key plugged during first enrollment to avoid ambiguity. + if (( ${#DEVS[@]} > 1 )); then + echo "Note: multiple FIDO2 tokens present: ${DEVS[*]}" + echo "If enrollment fails, try with only one key inserted." 
+ fi + + local rc=0 + for D in "${DEVS[@]}"; do + echo "-> Enrolling $D (you should be asked to touch the key)" + if ! SYSTEMD_LOG_LEVEL=debug systemd-cryptenroll "$(part 2)" \ + --fido2-device="$D" \ + --fido2-with-client-pin=no \ + --fido2-with-user-presence=yes \ + --fido2-with-user-verification=no \ + --label="solo-$(basename "$D")"; then + echo "WARN: enrollment failed for $D" + rc=1 + fi + done + + echo "Tokens enrolled (if any):" + systemd-cryptenroll "$(part 2)" --list || true + return $rc +} + +cleanup(){ + rm -f "$TGT_ROOT/usr/sbin/policy-rc.d" || true + umount -lf "$TGT_ROOT/dev/pts" 2>/dev/null || true + for d in dev proc sys; do umount -lf "$TGT_ROOT/$d" 2>/dev/null || true; done + umount -lf "$TGT_ROOT/boot/firmware" 2>/dev/null || true + umount -lf "$TGT_BOOT" 2>/dev/null || true + umount -lf "$TGT_ROOT" 2>/dev/null || true + cryptsetup close cryptroot 2>/dev/null || true + umount -lf "$IMG_BOOT_MNT" 2>/dev/null || true + umount -lf "$IMG_ROOT_MNT" 2>/dev/null || true +} + +main(){ + require_root + need losetup; need parted; need rsync + auto_detect_target_disk + echo "Target disk: $NVME" + ensure_binfmt_aarch64 + ensure_image + preflight_cleanup + guard_target_device + open_image + confirm_and_wipe + setup_luks + mount_targets + rsync_root_and_boot + write_crypttab_fstab + fix_firmware_files + seed_cloud_init + prep_chroot_mounts + in_chroot + verify_boot_assets + need_host_fido2 + enroll_fido_tokens + cleanup + echo "✅ NVMe prepared." + echo " Install in the Pi 5 and boot with no SD." + echo " Expect LUKS to unlock automatically with a Solo key inserted;" + echo " passphrase fallback remains. 
Hostname: ${STYX_HOSTNAME} User: ${STYX_USER}" + echo " On first boot, reach it via: ssh -i ~/.ssh/id_ed25519_titan styx@titan-ag.local" +} + +main "$@" diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index b176c64..dc62ef5 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -71,8 +71,8 @@ spec: persistentVolume: enabled: true - size: 100Gi # adjust; uses default StorageClass (Longhorn) - # storageClassName: "" # set if you want a specific class + size: 100Gi + storageClassName: "astreae" # Enable built-in Kubernetes scraping scrape: @@ -245,8 +245,8 @@ spec: GF_SECURITY_ALLOW_EMBEDDING: "true" grafana.ini: server: - domain: reporting.bstein.dev - root_url: https://reporting.bstein.dev/ + domain: atlas.metrics.bstein.dev + root_url: https://atlas.metrics.bstein.dev/ auth.anonymous: hide_version: true users: @@ -322,14 +322,14 @@ spec: annotations: cert-manager.io/cluster-issuer: letsencrypt hosts: - - host: alerts.bstein.dev + - host: atlas.alerts.bstein.dev paths: - path: / pathType: Prefix tls: - secretName: alerts-bstein-dev-tls hosts: - - alerts.bstein.dev + - atlas.alerts.bstein.dev config: global: resolve_timeout: 5m From d0b6fbe763b4c2489ebd44d370aeee05d8cc49fa Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 15 Nov 2025 11:16:37 -0300 Subject: [PATCH 07/71] victoria-metrics: revert storageclass change --- services/monitoring/helmrelease.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index dc62ef5..3e5c78c 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -72,7 +72,6 @@ spec: persistentVolume: enabled: true size: 100Gi - storageClassName: "astreae" # Enable built-in Kubernetes scraping scrape: From 683dc84289ad0f8b687ead0a1cf5ff8dbe8e11ca Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 15 Nov 2025 11:18:40 -0300 Subject: [PATCH 08/71] 
grafana: use atlas metrics hostname --- services/monitoring/helmrelease.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 3e5c78c..1720af5 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -256,12 +256,12 @@ spec: annotations: cert-manager.io/cluster-issuer: letsencrypt hosts: - - reporting.bstein.dev + - atlas.metrics.bstein.dev path: / tls: - - secretName: grafana-reporting-tls + - secretName: grafana-atlas-metrics-tls hosts: - - reporting.bstein.dev + - atlas.metrics.bstein.dev datasources: datasources.yaml: apiVersion: 1 From 46b6b1f3b896eb8cd46d0ae8e33d5f00dc30ceae Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 15 Nov 2025 11:35:27 -0300 Subject: [PATCH 09/71] grafana: set datasource uid --- services/monitoring/helmrelease.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 1720af5..266ddcd 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -273,6 +273,7 @@ spec: isDefault: true jsonData: timeInterval: "15s" + uid: atlas-vm dashboardProviders: dashboardproviders.yaml: apiVersion: 1 From eb3991b6283bfb606f094778fec437f0daef6203 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 15 Nov 2025 11:59:48 -0300 Subject: [PATCH 10/71] dashboards: improve public view and fix color --- .../monitoring/grafana-dashboard-public.yaml | 115 +++++++++++++++--- .../monitoring/grafana-dashboard-sre.yaml | 2 +- 2 files changed, 100 insertions(+), 17 deletions(-) diff --git a/services/monitoring/grafana-dashboard-public.yaml b/services/monitoring/grafana-dashboard-public.yaml index db5d6c1..aee871f 100644 --- a/services/monitoring/grafana-dashboard-public.yaml +++ b/services/monitoring/grafana-dashboard-public.yaml @@ -177,31 +177,114 @@ data: "y": 7 }, "id": 3, - "options": { - "legend": { - 
"calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, "targets": [ { "datasource": { "type": "prometheus", "uid": "atlas-vm" }, - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace!=\"\", container!=\"\"}[5m])) by (namespace)", + "expr": "sum(kube_pod_status_phase{phase=\"Running\"}) by (namespace)", "legendFormat": "{{namespace}}", "refId": "A" } ], - "title": "Namespace CPU (5m avg)", - "type": "timeseries" + "title": "Running pods per namespace", + "type": "bargauge", + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "showUnfilled": false + } + }, + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 4, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", + "legendFormat": "Ready", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"false\"})", + "legendFormat": "Not Ready", + "refId": "B" + } + ], + "title": "Node readiness", + "type": "piechart", + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "pieType": "donut" + } + }, + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 5, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "expr": "sum by (namespace) (increase(kube_pod_status_phase{phase=\"Failed\"}[1d]))", + "legendFormat": "{{namespace}}", + "refId": "A" + } + ], + "title": "Failed pods (24h)", + "type": "table", + "fieldConfig": { + "defaults": { + "unit": "none", + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "red", "value": 1} + ] + } + }, + "overrides": [] + }, + "options": { + "showHeader": true + } } ], "refresh": "30s", @@ -215,7 +298,7 @@ data: "list": [] }, "time": { - "from": "now-6h", + "from": "now-12h", "to": "now" }, "timepicker": {}, diff --git a/services/monitoring/grafana-dashboard-sre.yaml b/services/monitoring/grafana-dashboard-sre.yaml index 12995af..d146275 100644 --- a/services/monitoring/grafana-dashboard-sre.yaml +++ b/services/monitoring/grafana-dashboard-sre.yaml @@ -38,7 +38,7 @@ data: "fieldConfig": { "defaults": { "color": { - "mode": "continuous" + "mode": "continuous-RdYlGr" }, "mappings": [], "max": 100, From 0b1437b77c93c5a0ac411983d080938e3030af8c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sat, 15 Nov 2025 21:03:11 -0300 Subject: [PATCH 11/71] monitoring: refresh grafana dashboards --- .../monitoring/grafana-dashboard-public.yaml | 545 ++++++++++++++---- .../monitoring/grafana-dashboard-sre.yaml | 527 ++++++++++++++--- services/monitoring/grafana-folders.yaml | 28 + services/monitoring/helmrelease.yaml | 5 + services/monitoring/kustomization.yaml | 1 + 5 files changed, 903 insertions(+), 203 deletions(-) create mode 100644 services/monitoring/grafana-folders.yaml diff --git a/services/monitoring/grafana-dashboard-public.yaml b/services/monitoring/grafana-dashboard-public.yaml index aee871f..126b1b3 100644 --- a/services/monitoring/grafana-dashboard-public.yaml +++ b/services/monitoring/grafana-dashboard-public.yaml @@ -25,17 +25,30 @@ data: ] }, "editable": false, - "fiscalYearStartMonth": 0, + "folderUid": "atlas-public", "graphTooltip": 0, - "id": null, "links": [], - "liveNow": false, "panels": [ { + "id": 1, + "type": "stat", + "title": "Running pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": 
"sum(kube_pod_status_phase{phase=\"Running\"})", + "refId": "A" + } + ], "fieldConfig": { "defaults": { "color": { @@ -46,8 +59,12 @@ data: "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null + }, + { + "color": "green", + "value": 1 } ] }, @@ -55,59 +72,105 @@ data: }, "overrides": [] }, - "gridPos": { - "h": 7, - "w": 6, - "x": 0, - "y": 0 - }, - "id": 1, "options": { "colorMode": "value", "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "10.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "editorMode": "code", - "expr": "sum(kube_pod_status_phase{phase=\"Running\"})", - "legendFormat": "", - "range": true, - "refId": "A" } - ], - "title": "Running pods", - "type": "stat" + } }, { + "id": 2, + "type": "stat", + "title": "Ready node percentage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, - "description": "Aggregated CPU usage across all schedulable nodes.", + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / sum(kube_node_info) * 100", + "refId": "A" + } + ], "fieldConfig": { "defaults": { "color": { - "mode": "continuous-BlYlRd" + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 90 + }, + { + "color": "green", + "value": 98 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 3, + "type": "stat", + "title": "Cluster CPU 
saturation", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "avg((1 - rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, "mappings": [], - "max": 100, - "min": 0, "thresholds": { "mode": "percentage", "steps": [ @@ -117,7 +180,7 @@ data: }, { "color": "yellow", - "value": 60 + "value": 65 }, { "color": "red", @@ -129,79 +192,165 @@ data: }, "overrides": [] }, - "gridPos": { - "h": 7, - "w": 6, - "x": 6, - "y": 0 - }, - "id": 2, "options": { "colorMode": "value", "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false - }, - "text": {}, - "textMode": "auto" - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "expr": "avg(100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100))", - "legendFormat": "", - "refId": "A" } - ], - "title": "Average node CPU", - "type": "stat" + } }, { + "id": 4, + "type": "stat", + "title": "Cluster memory usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 7 + "h": 6, + "w": 6, + "x": 18, + "y": 0 }, - "id": 3, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "expr": "sum(kube_pod_status_phase{phase=\"Running\"}) by (namespace)", - "legendFormat": "{{namespace}}", + "expr": "100 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes) * 100)", "refId": "A" } ], - "title": "Running pods per namespace", - "type": "bargauge", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + 
"color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, "options": { - "displayMode": "gradient", - "orientation": "horizontal", + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false - }, - "showUnfilled": false + } } }, { + "id": 5, + "type": "piechart", + "title": "Namespace CPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 6 + }, + "targets": [ + { + "expr": "topk(8, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "cores" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 6, + "type": "piechart", + "title": "Namespace memory share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 6 + }, + "targets": [ + { + "expr": "topk(8, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 7, + "type": "timeseries", + "title": "Node CPU usage (per node)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -212,38 +361,70 @@ data: "x": 0, "y": 15 }, - "id": 4, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": 
"atlas-vm" - }, - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", - "legendFormat": "Ready", - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"false\"})", - "legendFormat": "Not Ready", - "refId": "B" + "expr": "avg(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (instance) * 100", + "refId": "A", + "legendFormat": "{{instance}}" } ], - "title": "Node readiness", - "type": "piechart", + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, "options": { "legend": { "displayMode": "table", - "placement": "right" + "placement": "bottom" }, - "pieType": "donut" + "tooltip": { + "mode": "multi" + } } }, { + "id": 8, + "type": "timeseries", + "title": "Node memory usage (per node)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "targets": [ + { + "expr": "avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) by (instance)", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 9, + "type": "table", + "title": "Key service availability", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -254,29 +435,39 @@ data: "x": 0, "y": 23 }, - "id": 5, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "expr": "sum by (namespace) (increase(kube_pod_status_phase{phase=\"Failed\"}[1d]))", - "legendFormat": "{{namespace}}", + "expr": "max by (service) (up{service=~\"traefik|gitea|vault|victoria-metrics-single|grafana|alertmanager\"})", "refId": "A" } ], - "title": "Failed pods (24h)", - "type": "table", "fieldConfig": 
{ "defaults": { - "unit": "none", - "mappings": [], + "mappings": [ + { + "id": 0, + "type": 1, + "value": "0", + "text": "Down" + }, + { + "id": 1, + "type": 1, + "value": "1", + "text": "Up" + } + ], "thresholds": { "mode": "absolute", "steps": [ - {"color": "green", "value": null}, - {"color": "red", "value": 1} + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } ] } }, @@ -285,6 +476,126 @@ data: "options": { "showHeader": true } + }, + { + "id": 10, + "type": "table", + "title": "Failed pods (24h trend)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 23 + }, + "targets": [ + { + "expr": "topk(10, sum(increase(kube_pod_status_phase{phase=\"Failed\"}[24h])) by (namespace))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "showHeader": true + } + }, + { + "id": 11, + "type": "timeseries", + "title": "Cluster network throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 31 + }, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\"}[5m]))", + "refId": "A", + "legendFormat": "Receive" + }, + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]))", + "refId": "B", + "legendFormat": "Transmit" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 12, + "type": "timeseries", + "title": "Storage usage across nodes", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 31 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}) / 
sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + } + } + }, + { + "id": 13, + "type": "text", + "title": "About this dashboard", + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 39 + }, + "options": { + "content": "### Atlas at a glance\n- Raspberry Pi + Jetson hybrid cluster with Flux-managed GitOps\n- Metrics powered by VictoriaMetrics, visualized by Grafana\n- Login for SRE mode with pod-level drilldowns, alert routes, and storage health", + "mode": "markdown" + } } ], "refresh": "30s", @@ -301,10 +612,8 @@ data: "from": "now-12h", "to": "now" }, - "timepicker": {}, - "timezone": "", "title": "Atlas Public Overview", "uid": "atlas-public", - "version": 1, - "weekStart": "" + "version": 3 } + diff --git a/services/monitoring/grafana-dashboard-sre.yaml b/services/monitoring/grafana-dashboard-sre.yaml index d146275..b46c17a 100644 --- a/services/monitoring/grafana-dashboard-sre.yaml +++ b/services/monitoring/grafana-dashboard-sre.yaml @@ -20,29 +20,41 @@ data: "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", - "type": "dashboard" + "type": "dashboard" } ] }, "editable": true, - "fiscalYearStartMonth": 0, + "folderUid": "atlas-sre", "graphTooltip": 0, "links": [], "panels": [ { + "id": 1, + "type": "stat", + "title": "Ready nodes", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, - "description": "Percentage of Ready nodes.", + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / sum(kube_node_info) * 100", + "refId": "A" + } + ], "fieldConfig": { "defaults": { "color": { - "mode": "continuous-RdYlGr" + "mode": "palette-classic" }, "mappings": [], - "max": 100, - "min": 0, "thresholds": { "mode": 
"percentage", "steps": [ @@ -50,9 +62,13 @@ data: "color": "red", "value": null }, + { + "color": "yellow", + "value": 95 + }, { "color": "green", - "value": 90 + "value": 99 } ] }, @@ -60,18 +76,10 @@ data: }, "overrides": [] }, - "gridPos": { - "h": 7, - "w": 6, - "x": 0, - "y": 0 - }, - "id": 10, "options": { "colorMode": "value", - "graphMode": "none", + "graphMode": "area", "justifyMode": "center", - "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" @@ -79,92 +87,192 @@ data: "fields": "", "values": false } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "expr": "avg(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) * 100", - "refId": "A" - } - ], - "title": "Ready nodes", - "type": "stat" + } }, { + "id": 2, + "type": "stat", + "title": "Pending pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 5, "w": 6, "x": 6, "y": 0 }, - "id": 11, - "options": { - "legend": { - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "expr": "sum by (node)(node_filesystem_avail_bytes{mountpoint=\"/\"})", - "legendFormat": "{{node}}", + "expr": "sum(kube_pod_status_phase{phase=\"Pending\"})", "refId": "A" } ], - "title": "Free root filesystem bytes", - "type": "timeseries" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 3 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } }, { + "id": 3, + "type": "stat", + "title": 
"Unavailable deployment replicas", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 7 - }, - "id": 12, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } + "h": 5, + "w": 6, + "x": 12, + "y": 0 }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"crypto\",container!=\"\"}[5m])) by (pod)", - "legendFormat": "{{pod}}", + "expr": "sum(kube_deployment_status_replicas_unavailable)", "refId": "A" } ], - "title": "Crypto namespace CPU usage", - "type": "timeseries" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } }, { + "id": 4, + "type": "stat", + "title": "Active alerts", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "sum(ALERTS{alertstate=\"firing\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", 
+ "values": false + } + } + }, + { + "id": 5, + "type": "timeseries", + "title": "Node CPU usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -173,9 +281,168 @@ data: "h": 9, "w": 12, "x": 0, - "y": 17 + "y": 5 + }, + "targets": [ + { + "expr": "avg(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (instance) * 100", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 6, + "type": "timeseries", + "title": "Node memory usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 5 + }, + "targets": [ + { + "expr": "avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) by (instance)", + "refId": "A", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 7, + "type": "timeseries", + "title": "Top pod CPU (5m avg)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 14 + }, + "targets": [ + { + "expr": "topk(5, sum(rate(container_cpu_usage_seconds_total{pod!=\"\",container!=\"\"}[5m])) by (namespace,pod))", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "cores" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 8, + "type": "timeseries", + "title": "Top pod memory working set", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + 
"gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 14 + }, + "targets": [ + { + "expr": "topk(5, sum(container_memory_working_set_bytes{pod!=\"\",container!=\"\"}) by (namespace,pod))", + "refId": "A", + "legendFormat": "{{namespace}}/{{pod}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 9, + "type": "bargauge", + "title": "Namespace restart rate (6h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 23 + }, + "targets": [ + { + "expr": "topk(8, sum(increase(kube_pod_container_status_restarts_total{namespace!=\"\"}[6h])) by (namespace))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] }, - "id": 13, "options": { "displayMode": "gradient", "orientation": "horizontal", @@ -185,22 +452,112 @@ data: ], "fields": "", "values": false - }, - "showUnfilled": false + } + } + }, + { + "id": 10, + "type": "table", + "title": "Deployments missing replicas", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 23 }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "expr": "count(sum(kube_pod_status_phase{phase=\"Failed\"}) by (namespace))", - "legendFormat": "", + "expr": "topk(10, sum by (namespace,deployment) (kube_deployment_status_replicas_unavailable))", "refId": "A" } ], - "title": "Namespaces with failed pods", - "type": "bargauge" + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "showHeader": true + } + }, + { + "id": 11, + "type": "timeseries", + "title": "Pod phase breakdown", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 31 + }, + 
"targets": [ + { + "expr": "sum(kube_pod_status_phase) by (phase)", + "refId": "A", + "legendFormat": "{{phase}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 12, + "type": "timeseries", + "title": "PVC usage (top 8)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 31 + }, + "targets": [ + { + "expr": "topk(8, sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100))", + "refId": "A", + "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } } ], "schemaVersion": 39, @@ -216,8 +573,8 @@ data: "from": "now-12h", "to": "now" }, - "timepicker": {}, "title": "Atlas SRE Overview", "uid": "atlas-sre", - "version": 1 + "version": 2 } + diff --git a/services/monitoring/grafana-folders.yaml b/services/monitoring/grafana-folders.yaml new file mode 100644 index 0000000..503aaee --- /dev/null +++ b/services/monitoring/grafana-folders.yaml @@ -0,0 +1,28 @@ +# services/monitoring/grafana-folders.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-folders + labels: + app.kubernetes.io/name: grafana + app.kubernetes.io/component: folders +data: + folders.yaml: | + apiVersion: 1 + folders: + - uid: atlas-public + title: Atlas Public + permissions: + - role: Viewer + permission: View + - role: Editor + permission: Edit + - role: Admin + permission: Admin + - uid: atlas-sre + title: Atlas SRE + permissions: + - role: Editor + permission: View + - role: Admin + permission: Admin diff --git a/services/monitoring/helmrelease.yaml 
b/services/monitoring/helmrelease.yaml index 266ddcd..4efae70 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -297,6 +297,11 @@ spec: dashboardsConfigMaps: public: grafana-dashboard-public sre: grafana-dashboard-sre + extraConfigmapMounts: + - name: grafana-folders + mountPath: /etc/grafana/provisioning/folders + configMap: grafana-folders + readOnly: true --- diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index bb321b5..73e7d23 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -7,4 +7,5 @@ resources: - rbac.yaml - grafana-dashboard-public.yaml - grafana-dashboard-sre.yaml + - grafana-folders.yaml - helmrelease.yaml From b004bf99dc88294f15ec38b66f39b7f01a2435c0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Sun, 16 Nov 2025 00:55:28 -0300 Subject: [PATCH 12/71] monitoring: enrich dashboards --- .../monitoring/grafana-dashboard-public.yaml | 648 +++++++++++++++--- .../monitoring/grafana-dashboard-sre.yaml | 25 +- 2 files changed, 551 insertions(+), 122 deletions(-) diff --git a/services/monitoring/grafana-dashboard-public.yaml b/services/monitoring/grafana-dashboard-public.yaml index 126b1b3..35fa124 100644 --- a/services/monitoring/grafana-dashboard-public.yaml +++ b/services/monitoring/grafana-dashboard-public.yaml @@ -38,8 +38,8 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 6, - "w": 6, + "h": 5, + "w": 4, "x": 0, "y": 0 }, @@ -82,26 +82,27 @@ data: ], "fields": "", "values": false - } + }, + "textMode": "value" } }, { "id": 2, "type": "stat", - "title": "Ready node percentage", + "title": "Ready nodes", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 6, - "w": 6, - "x": 6, + "h": 5, + "w": 4, + "x": 4, "y": 0 }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / sum(kube_node_info) * 100", + "expr": 
"sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", "refId": "A" } ], @@ -112,23 +113,19 @@ data: }, "mappings": [], "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ { - "color": "red", + "color": "rgba(115, 115, 115, 1)", "value": null }, - { - "color": "orange", - "value": 90 - }, { "color": "green", - "value": 98 + "value": 1 } ] }, - "unit": "percent" + "unit": "none" }, "overrides": [] }, @@ -142,26 +139,27 @@ data: ], "fields": "", "values": false - } + }, + "textMode": "value" } }, { "id": 3, "type": "stat", - "title": "Cluster CPU saturation", + "title": "Cluster nodes", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 6, - "w": 6, - "x": 12, + "h": 5, + "w": 4, + "x": 8, "y": 0 }, "targets": [ { - "expr": "avg((1 - rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", + "expr": "count(kube_node_info)", "refId": "A" } ], @@ -172,23 +170,19 @@ data: }, "mappings": [], "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", - "value": 65 - }, - { - "color": "red", - "value": 85 + "color": "green", + "value": 1 } ] }, - "unit": "percent" + "unit": "none" }, "overrides": [] }, @@ -202,26 +196,27 @@ data: ], "fields": "", "values": false - } + }, + "textMode": "value" } }, { "id": 4, "type": "stat", - "title": "Cluster memory usage", + "title": "Hottest node CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 6, - "w": 6, - "x": 18, + "h": 5, + "w": 4, + "x": 12, "y": 0 }, "targets": [ { - "expr": "100 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes) * 100)", + "expr": "topk(1, avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)))", "refId": "A" } ], @@ -262,11 +257,134 
@@ data: ], "fields": "", "values": false - } + }, + "textMode": "value_and_name" } }, { "id": 5, + "type": "stat", + "title": "Hottest node memory", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 0 + }, + "targets": [ + { + "expr": "topk(1, avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 75 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 6, + "type": "stat", + "title": "Failed pods (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 0 + }, + "targets": [ + { + "expr": "sum(increase(kube_pod_status_phase{phase=\"Failed\"}[24h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + 
{ + "id": 7, "type": "piechart", "title": "Namespace CPU share", "datasource": { @@ -277,11 +395,11 @@ data: "h": 9, "w": 12, "x": 0, - "y": 6 + "y": 5 }, "targets": [ { - "expr": "topk(8, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))", + "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))", "refId": "A" } ], @@ -307,7 +425,7 @@ data: } }, { - "id": 6, + "id": 8, "type": "piechart", "title": "Namespace memory share", "datasource": { @@ -318,11 +436,11 @@ data: "h": 9, "w": 12, "x": 12, - "y": 6 + "y": 5 }, "targets": [ { - "expr": "topk(8, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))", + "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))", "refId": "A" } ], @@ -348,7 +466,7 @@ data: } }, { - "id": 7, + "id": 9, "type": "timeseries", "title": "Node CPU usage (per node)", "datasource": { @@ -359,13 +477,13 @@ data: "h": 8, "w": 12, "x": 0, - "y": 15 + "y": 14 }, "targets": [ { - "expr": "avg(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (instance) * 100", + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", "refId": "A", - "legendFormat": "{{instance}}" + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -385,7 +503,7 @@ data: } }, { - "id": 8, + "id": 10, "type": "timeseries", "title": "Node memory usage (per node)", "datasource": { @@ -396,13 +514,13 @@ data: "h": 8, "w": 12, "x": 12, - "y": 15 + "y": 14 }, "targets": [ { - "expr": "avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) by (instance)", + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - 
node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", "refId": "A", - "legendFormat": "{{instance}}" + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -422,7 +540,7 @@ data: } }, { - "id": 9, + "id": 11, "type": "table", "title": "Key service availability", "datasource": { @@ -430,46 +548,23 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 0, - "y": 23 + "y": 22 }, "targets": [ { - "expr": "max by (service) (up{service=~\"traefik|gitea|vault|victoria-metrics-single|grafana|alertmanager\"})", + "expr": "label_replace((sum by (deployment,namespace) (kube_deployment_status_replicas_available{deployment=~\"traefik|gitea|grafana\",namespace=~\"traefik|gitea|monitoring\"}) / sum by (deployment,namespace) (kube_deployment_spec_replicas{deployment=~\"traefik|gitea|grafana\",namespace=~\"traefik|gitea|monitoring\"})), \"service\", \"$1\", \"deployment\", \"(.*)\") or label_replace((sum by (statefulset,namespace) (kube_statefulset_status_replicas_ready{statefulset=~\"vault|alertmanager|victoria-metrics-single-server\",namespace=~\"vault|monitoring\"}) / sum by (statefulset,namespace) (kube_statefulset_status_replicas{statefulset=~\"vault|alertmanager|victoria-metrics-single-server\",namespace=~\"vault|monitoring\"})), \"service\", \"$1\", \"statefulset\", \"(.*)\")", "refId": "A" } ], "fieldConfig": { "defaults": { - "mappings": [ - { - "id": 0, - "type": 1, - "value": "0", - "text": "Down" - }, - { - "id": 1, - "type": 1, - "value": "1", - "text": "Up" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - } + "custom": { + "align": "auto" + }, + "unit": "percent" }, "overrides": [] }, @@ -478,22 +573,22 @@ data: } }, { - "id": 10, + "id": 12, "type": "table", - "title": "Failed pods (24h trend)", + "title": "Failed pods by 
namespace (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 23 + "y": 22 }, "targets": [ { - "expr": "topk(10, sum(increase(kube_pod_status_phase{phase=\"Failed\"}[24h])) by (namespace))", + "expr": "topk(10, sum by (namespace) (increase(kube_pod_status_phase{phase=\"Failed\"}[24h])))", "refId": "A" } ], @@ -508,9 +603,9 @@ data: } }, { - "id": 11, + "id": 13, "type": "timeseries", - "title": "Cluster network throughput", + "title": "Root filesystem usage per node", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -519,23 +614,18 @@ data: "h": 8, "w": 12, "x": 0, - "y": 31 + "y": 29 }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\"}[5m]))", + "expr": "avg by (node) (((label_replace(1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", "refId": "A", - "legendFormat": "Receive" - }, - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]))", - "refId": "B", - "legendFormat": "Transmit" + "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { - "unit": "Bps" + "unit": "percent" }, "overrides": [] }, @@ -550,9 +640,9 @@ data: } }, { - "id": 12, - "type": "timeseries", - "title": "Storage usage across nodes", + "id": 14, + "type": "bargauge", + "title": "Nodes closest to full root disks", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -561,40 +651,377 @@ data: "h": 8, "w": 12, "x": 12, - "y": 31 + "y": 29 }, "targets": [ { - "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}) * 100)", + "expr": "topk(8, avg by (node) (((label_replace(1 - 
(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)))", "refId": "A" } ], "fieldConfig": { "defaults": { + "unit": "percent", + "min": 0, + "max": 100 + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 15, + "type": "stat", + "title": "Astreae usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 37 + }, + "targets": [ + { + "expr": "(sum(longhorn_disk_usage_bytes{disk=~\"astreae-.*\"}) / sum(longhorn_disk_capacity_bytes{disk=~\"astreae-.*\"})) * 100", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, "unit": "percent" }, "overrides": [] }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 16, + "type": "stat", + "title": "Asteria usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 37 + }, + "targets": [ + { + "expr": "(sum(longhorn_disk_usage_bytes{disk=~\"asteria-.*\"}) / sum(longhorn_disk_capacity_bytes{disk=~\"asteria-.*\"})) * 100", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + 
"color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 17, + "type": "stat", + "title": "Astreae schedulable", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 37 + }, + "targets": [ + { + "expr": "(sum(longhorn_disk_capacity_bytes{disk=~\"astreae-.*\"}) - sum(longhorn_disk_usage_bytes{disk=~\"astreae-.*\"}) - sum(longhorn_disk_reservation_bytes{disk=~\"astreae-.*\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 18, + "type": "stat", + "title": "Asteria schedulable", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 37 + }, + "targets": [ + { + "expr": "(sum(longhorn_disk_capacity_bytes{disk=~\"asteria-.*\"}) - sum(longhorn_disk_usage_bytes{disk=~\"asteria-.*\"}) - sum(longhorn_disk_reservation_bytes{disk=~\"asteria-.*\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + 
"color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 19, + "type": "piechart", + "title": "Longhorn node readiness", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 44 + }, + "targets": [ + { + "expr": "sum(longhorn_node_status{condition=\"ready\"})", + "refId": "A", + "legendFormat": "Ready" + }, + { + "expr": "(longhorn_node_count_total - sum(longhorn_node_status{condition=\"ready\"}))", + "refId": "B", + "legendFormat": "Offline" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, "options": { "legend": { "displayMode": "list", - "placement": "bottom" + "placement": "right" + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false } } }, { - "id": 13, + "id": 20, + "type": "piechart", + "title": "Longhorn disk schedulability", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 44 + }, + "targets": [ + { + "expr": "sum(sum by (node,disk) (longhorn_disk_status{condition=\"schedulable\"}))", + "refId": "A", + "legendFormat": "Schedulable" + }, + { + "expr": "(count(sum by (node,disk) (longhorn_disk_status{condition=\"ready\"})) - sum(sum by (node,disk) (longhorn_disk_status{condition=\"schedulable\"})))", + "refId": "B", + "legendFormat": "Blocked" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 21, "type": "text", 
"title": "About this dashboard", "gridPos": { - "h": 6, + "h": 5, "w": 24, "x": 0, - "y": 39 + "y": 51 }, "options": { - "content": "### Atlas at a glance\n- Raspberry Pi + Jetson hybrid cluster with Flux-managed GitOps\n- Metrics powered by VictoriaMetrics, visualized by Grafana\n- Login for SRE mode with pod-level drilldowns, alert routes, and storage health", - "mode": "markdown" + "mode": "markdown", + "content": "### Atlas at a glance\n- Flux-managed Pi + Jetson cluster with 20+ active nodes\n- Longhorn tiers: Astreae (3x replicas) & Asteria (2x replicas) tracked separately\n- Login for the SRE view with alert routing, Longhorn drilldowns, and workload burn rates" } } ], @@ -614,6 +1041,5 @@ data: }, "title": "Atlas Public Overview", "uid": "atlas-public", - "version": 3 + "version": 5 } - diff --git a/services/monitoring/grafana-dashboard-sre.yaml b/services/monitoring/grafana-dashboard-sre.yaml index b46c17a..d5d8dca 100644 --- a/services/monitoring/grafana-dashboard-sre.yaml +++ b/services/monitoring/grafana-dashboard-sre.yaml @@ -45,7 +45,7 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / sum(kube_node_info) * 100", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / count(kube_node_info) * 100", "refId": "A" } ], @@ -86,7 +86,8 @@ data: ], "fields": "", "values": false - } + }, + "textMode": "value" } }, { @@ -146,7 +147,8 @@ data: ], "fields": "", "values": false - } + }, + "textMode": "value" } }, { @@ -206,7 +208,8 @@ data: ], "fields": "", "values": false - } + }, + "textMode": "value" } }, { @@ -266,7 +269,8 @@ data: ], "fields": "", "values": false - } + }, + "textMode": "value" } }, { @@ -285,9 +289,9 @@ data: }, "targets": [ { - "expr": "avg(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (instance) * 100", + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", 
\"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", "refId": "A", - "legendFormat": "{{instance}}" + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -322,9 +326,9 @@ data: }, "targets": [ { - "expr": "avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) by (instance)", + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", "refId": "A", - "legendFormat": "{{instance}}" + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -575,6 +579,5 @@ data: }, "title": "Atlas SRE Overview", "uid": "atlas-sre", - "version": 2 + "version": 4 } - From a41f25e66d7bbc02ea3fb287920f4eb4bfda686d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 14:22:46 -0300 Subject: [PATCH 13/71] monitoring: restructure grafana dashboards --- scripts/render_dashboards.py | 605 ++++++++ .../monitoring/dashboards/atlas-nodes.json | 369 +++++ .../monitoring/dashboards/atlas-overview.json | 1270 +++++++++++++++++ .../monitoring/dashboards/atlas-pods.json | 137 ++ .../monitoring/dashboards/atlas-storage.json | 359 +++++ ...-sre.yaml => grafana-dashboard-nodes.yaml} | 331 +---- ...c.yaml => grafana-dashboard-overview.yaml} | 716 ++++++---- .../monitoring/grafana-dashboard-pods.yaml | 146 ++ .../monitoring/grafana-dashboard-storage.yaml | 368 +++++ services/monitoring/grafana-folders.yaml | 22 +- services/monitoring/helmrelease.yaml | 48 +- services/monitoring/kustomization.yaml | 6 +- 12 files changed, 3847 insertions(+), 530 deletions(-) create mode 100755 scripts/render_dashboards.py create mode 100644 services/monitoring/dashboards/atlas-nodes.json create mode 100644 services/monitoring/dashboards/atlas-overview.json create mode 100644 services/monitoring/dashboards/atlas-pods.json create mode 100644 
services/monitoring/dashboards/atlas-storage.json rename services/monitoring/{grafana-dashboard-sre.yaml => grafana-dashboard-nodes.yaml} (53%) rename services/monitoring/{grafana-dashboard-public.yaml => grafana-dashboard-overview.yaml} (67%) create mode 100644 services/monitoring/grafana-dashboard-pods.yaml create mode 100644 services/monitoring/grafana-dashboard-storage.yaml diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py new file mode 100755 index 0000000..fa9ef58 --- /dev/null +++ b/scripts/render_dashboards.py @@ -0,0 +1,605 @@ +#!/usr/bin/env python3 +"""Generate Grafana dashboards and render them into ConfigMaps. + +Usage: + python scripts/render_dashboards.py --build # rebuild JSON + ConfigMaps + python scripts/render_dashboards.py # just render ConfigMaps +""" +import argparse +import json +import textwrap +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards" +CONFIG_TEMPLATE = textwrap.dedent( + """# {relative_path} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {name} + labels: + grafana_dashboard: "1" +data: + {key}: | +{payload} +""" +) + +PROM_DS = {"type": "prometheus", "uid": "atlas-vm"} + + +# --------------------------------------------------------------------------- # +# Panel helper factories +# --------------------------------------------------------------------------- # + + +def stat_panel(panel_id, title, expr, grid, *, unit="none", thresholds=None, + text_mode="value", legend=None): + defaults = { + "color": {"mode": "palette-classic"}, + "mappings": [], + "thresholds": thresholds + or { + "mode": "absolute", + "steps": [ + {"color": "rgba(115, 115, 115, 1)", "value": None}, + {"color": "green", "value": 1}, + ], + }, + "unit": unit, + } + panel = { + "id": panel_id, + "type": "stat", + "title": title, + "datasource": PROM_DS, + "gridPos": grid, + "targets": [{"expr": expr, "refId": "A"}], + "fieldConfig": {"defaults": defaults, 
"overrides": []}, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, + "textMode": text_mode, + }, + } + if legend: + panel["targets"][0]["legendFormat"] = legend + return panel + + +def timeseries_panel(panel_id, title, expr, grid, *, unit="none", legend=None, + legend_display="table", legend_placement="bottom", + legend_calcs=None, time_from=None): + panel = { + "id": panel_id, + "type": "timeseries", + "title": title, + "datasource": PROM_DS, + "gridPos": grid, + "targets": [{"expr": expr, "refId": "A"}], + "fieldConfig": {"defaults": {"unit": unit}, "overrides": []}, + "options": { + "legend": { + "displayMode": legend_display, + "placement": legend_placement, + }, + "tooltip": {"mode": "multi"}, + }, + } + if legend: + panel["targets"][0]["legendFormat"] = legend + if legend_calcs: + panel["options"]["legend"]["calcs"] = legend_calcs + if time_from: + panel["timeFrom"] = time_from + return panel + + +def table_panel(panel_id, title, expr, grid, *, unit="none", transformations=None, + description=None): + panel = { + "id": panel_id, + "type": "table", + "title": title, + "datasource": PROM_DS, + "gridPos": grid, + "targets": [{"expr": expr, "refId": "A"}], + "fieldConfig": {"defaults": {"unit": unit}, "overrides": []}, + "options": {"showHeader": True}, + } + if transformations: + panel["transformations"] = transformations + if description: + panel["description"] = description + return panel + + +def pie_panel(panel_id, title, expr, grid): + return { + "id": panel_id, + "type": "piechart", + "title": title, + "datasource": PROM_DS, + "gridPos": grid, + "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}], + "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []}, + "options": { + "legend": {"displayMode": "list", "placement": "right"}, + "pieType": "pie", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", 
"values": False}, + }, + } + + +def text_panel(panel_id, title, content, grid): + return { + "id": panel_id, + "type": "text", + "title": title, + "gridPos": grid, + "datasource": None, + "options": {"mode": "markdown", "content": content}, + } + + +def node_cpu_expr(scope=""): + expr = "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))" + if scope: + expr = f"{expr} * on(node) group_left() kube_node_info{{node=~\"{scope}\"}}" + return expr + + +def node_mem_expr(scope=""): + expr = "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))" + if scope: + expr = f"{expr} * on(node) group_left() kube_node_info{{node=~\"{scope}\"}}" + return expr + + +def root_usage_expr(): + return "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)" + + +def astreae_usage_expr(mount): + return ( + f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / " + f"sum(node_filesystem_size_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) * 100)" + ) + + +def astreae_free_expr(mount): + return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})" + + +def build_overview(): + thresholds_percent = { + "mode": "percentage", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 85}, + ], + } + panels = [] + stats = [ + (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})'), 
+ (2, "Ready nodes", 'sum(kube_node_status_condition{condition="Ready",status="true"})'), + (3, "Control plane ready", 'sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})'), + (4, "Control plane schedulable", 'sum(kube_node_spec_unschedulable{node=~"titan-0a|titan-0b|titan-0c"} == 0)'), + (5, "Problem pods", 'sum(kube_pod_status_phase{phase!~"Running|Succeeded"})'), + (6, "Stuck terminating", 'sum(((time() - kube_pod_deletion_timestamp) > 600))'), + ] + for idx, (panel_id, title, expr) in enumerate(stats): + panels.append( + stat_panel( + panel_id, + title, + expr, + {"h": 5, "w": 4, "x": 4 * idx, "y": 0}, + ) + ) + panels.append( + stat_panel( + 7, + "Hottest node: CPU", + node_cpu_expr(), + {"h": 5, "w": 4, "x": 24, "y": 0}, + unit="percent", + thresholds=thresholds_percent, + text_mode="value_and_name", + legend="{{node}}", + ) + ) + panels.append( + stat_panel( + 8, + "Hottest node: RAM", + node_mem_expr(), + {"h": 5, "w": 4, "x": 28, "y": 0}, + unit="percent", + thresholds=thresholds_percent, + text_mode="value_and_name", + legend="{{node}}", + ) + ) + + panels.append(pie_panel(9, "Namespace CPU share", 'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace))', {"h": 9, "w": 12, "x": 0, "y": 5})) + panels.append(pie_panel(10, "Namespace RAM share", 'topk(10, sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace))', {"h": 9, "w": 12, "x": 12, "y": 5})) + + panels.append( + timeseries_panel( + 11, + "Cluster node CPU", + node_cpu_expr(), + {"h": 8, "w": 12, "x": 0, "y": 14}, + unit="percent", + legend="{{node}}", + legend_calcs=["last"], + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 12, + "Cluster node RAM", + node_mem_expr(), + {"h": 8, "w": 12, "x": 12, "y": 14}, + unit="percent", + legend="{{node}}", + legend_calcs=["last"], + legend_display="table", + 
legend_placement="right", + ) + ) + + panels.append( + table_panel( + 13, + "Problem pods (details)", + "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + {"h": 8, "w": 12, "x": 0, "y": 22}, + unit="s", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + panels.append( + table_panel( + 14, + "Terminating >10m", + "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + {"h": 8, "w": 12, "x": 12, "y": 22}, + unit="s", + transformations=[ + {"id": "labelsToFields", "options": {}} , + {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}}, + ], + ) + ) + + panels.append( + timeseries_panel( + 15, + "Control plane CPU", + node_cpu_expr("titan-0a|titan-0b|titan-0c"), + {"h": 7, "w": 12, "x": 0, "y": 30}, + unit="percent", + legend="{{node}}", + ) + ) + panels.append( + timeseries_panel( + 16, + "Control plane RAM", + node_mem_expr("titan-0a|titan-0b|titan-0c"), + {"h": 7, "w": 12, "x": 12, "y": 30}, + unit="percent", + legend="{{node}}", + ) + ) + + panels.append( + timeseries_panel( + 17, + "Root filesystem usage", + root_usage_expr(), + {"h": 8, "w": 12, "x": 0, "y": 37}, + unit="percent", + legend="{{node}}", + legend_calcs=["last"], + legend_display="table", + legend_placement="right", + time_from="7d", + ) + ) + + panels.append( + { + "id": 18, + "type": "bargauge", + "title": "Nodes closest to full root disks", + "datasource": PROM_DS, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 37}, + "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "percentage", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 50}, + {"color": "orange", "value": 70}, + 
{"color": "red", "value": 85}, + ], + }, + }, + "overrides": [], + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, + }, + } + ) + + panels.append( + stat_panel( + 19, + "Astreae usage", + astreae_usage_expr("/mnt/astreae"), + {"h": 6, "w": 6, "x": 0, "y": 45}, + unit="percent", + thresholds=thresholds_percent, + ) + ) + panels.append( + stat_panel( + 20, + "Asteria usage", + astreae_usage_expr("/mnt/asteria"), + {"h": 6, "w": 6, "x": 6, "y": 45}, + unit="percent", + thresholds=thresholds_percent, + ) + ) + panels.append( + stat_panel( + 21, + "Astreae free", + astreae_free_expr("/mnt/astreae"), + {"h": 6, "w": 6, "x": 12, "y": 45}, + unit="bytesSI", + ) + ) + panels.append( + stat_panel( + 22, + "Asteria free", + astreae_free_expr("/mnt/asteria"), + {"h": 6, "w": 6, "x": 18, "y": 45}, + unit="bytesSI", + ) + ) + + panels.append( + table_panel( + 23, + "Astreae per-node usage", + '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} * 100)', + {"h": 8, "w": 12, "x": 0, "y": 51}, + unit="percent", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + panels.append( + table_panel( + 24, + "Asteria per-node usage", + '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} * 100)', + {"h": 8, "w": 12, "x": 12, "y": 51}, + unit="percent", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + + panels.append( + text_panel( + 25, + "About this dashboard", + "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders", + {"h": 5, "w": 24, 
"x": 0, "y": 59}, + ) + ) + + return { + "uid": "atlas-overview", + "title": "Atlas Overview", + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": {"type": "datasource", "uid": "grafana"}, + "enable": True, + "hide": True, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard", + } + ] + }, + "editable": False, + "folderUid": "atlas-overview", + "graphTooltip": 0, + "links": [ + {"title": "Pods dashboard", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False}, + {"title": "Nodes dashboard", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False}, + {"title": "Storage dashboard", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False}, + ], + "panels": panels, + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "overview"], + "templating": {"list": []}, + "time": {"from": "now-12h", "to": "now"}, + } + + +def build_pods_dashboard(): + panels = [] + panels.append( + table_panel( + 1, + "Pods not running", + "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + {"h": 10, "w": 24, "x": 0, "y": 0}, + unit="s", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + panels.append( + table_panel( + 2, + "CrashLoop / ImagePull", + "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", + {"h": 10, "w": 24, "x": 0, "y": 10}, + unit="s", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + panels.append( + table_panel( + 3, + "Terminating pods", + "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + {"h": 10, "w": 24, "x": 0, "y": 
20}, + unit="s", + transformations=[ + {"id": "labelsToFields", "options": {}} , + {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}}, + ], + ) + ) + return { + "uid": "atlas-pods", + "title": "Atlas Pods", + "folderUid": "atlas-pods", + "editable": True, + "panels": panels, + "time": {"from": "now-12h", "to": "now"}, + "annotations": {"list": []}, + "schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "pods"], + } + + +def build_nodes_dashboard(): + panels = [] + panels.append(stat_panel(1, "Node count", 'count(kube_node_info)', {"h": 5, "w": 6, "x": 0, "y": 0})) + panels.append(stat_panel(2, "Ready nodes", 'sum(kube_node_status_condition{condition="Ready",status="true"})', {"h": 5, "w": 6, "x": 6, "y": 0})) + panels.append(stat_panel(3, "Control plane CPU avg", node_cpu_expr("titan-0a|titan-0b|titan-0c"), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="percent", legend="{{node}}", text_mode="value_and_name")) + panels.append(stat_panel(4, "Control plane RAM avg", node_mem_expr("titan-0a|titan-0b|titan-0c"), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="percent", legend="{{node}}", text_mode="value_and_name")) + panels.append(timeseries_panel(5, "Node CPU", node_cpu_expr(), {"h": 9, "w": 24, "x": 0, "y": 5}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right")) + panels.append(timeseries_panel(6, "Node RAM", node_mem_expr(), {"h": 9, "w": 24, "x": 0, "y": 14}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right")) + panels.append(timeseries_panel(7, "Root filesystem", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 23}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="7d")) + return { + "uid": "atlas-nodes", + "title": "Atlas Nodes", + "folderUid": "atlas-nodes", + "editable": True, + "panels": panels, + "time": {"from": "now-12h", "to": "now"}, + "annotations": {"list": []}, + 
"schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "nodes"], + } + + +def build_storage_dashboard(): + panels = [] + panels.append(stat_panel(1, "Astreae usage", astreae_usage_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 0, "y": 0}, unit="percent")) + panels.append(stat_panel(2, "Asteria usage", astreae_usage_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 6, "y": 0}, unit="percent")) + panels.append(stat_panel(3, "Astreae free", astreae_free_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="bytesSI")) + panels.append(stat_panel(4, "Asteria free", astreae_free_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="bytesSI")) + panels.append(timeseries_panel(5, "Root filesystem", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 5}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="30d")) + panels.append(table_panel(6, "Astreae nodes", '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} * 100)', {"h": 10, "w": 12, "x": 0, "y": 14}, unit="percent", transformations=[{"id": "labelsToFields", "options": {}}])) + panels.append(table_panel(7, "Asteria nodes", '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} * 100)', {"h": 10, "w": 12, "x": 12, "y": 14}, unit="percent", transformations=[{"id": "labelsToFields", "options": {}}])) + return { + "uid": "atlas-storage", + "title": "Atlas Storage", + "folderUid": "atlas-storage", + "editable": True, + "panels": panels, + "time": {"from": "now-12h", "to": "now"}, + "annotations": {"list": []}, + "schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "storage"], + } + + +DASHBOARDS = { + "atlas-overview": { + "builder": build_overview, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-overview.yaml", + }, + 
"atlas-pods": { + "builder": build_pods_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-pods.yaml", + }, + "atlas-nodes": { + "builder": build_nodes_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-nodes.yaml", + }, + "atlas-storage": { + "builder": build_storage_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-storage.yaml", + }, +} + + +def write_json(uid: str, data: dict) -> None: + DASHBOARD_DIR.mkdir(parents=True, exist_ok=True) + path = DASHBOARD_DIR / f"{uid}.json" + path.write_text(json.dumps(data, indent=2) + "\n") + + +def render_configmap(uid: str, data: dict) -> None: + json_path = DASHBOARD_DIR / f"{uid}.json" + payload = json.dumps(json.loads(json_path.read_text()), indent=2) + indented = "\n".join(" " + line for line in payload.splitlines()) + output_path = data["configmap"] + content = CONFIG_TEMPLATE.format( + relative_path=output_path.relative_to(ROOT), + name=output_path.stem, + key=json_path.name, + payload=indented, + ) + output_path.write_text(content) + print(f"Rendered {json_path.name} -> {output_path.relative_to(ROOT)}") + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--build", action="store_true", help="Regenerate dashboard JSON files from builders") + args = parser.parse_args() + + if args.build: + for uid, info in DASHBOARDS.items(): + write_json(uid, info["builder"]()) + + for uid, info in DASHBOARDS.items(): + render_configmap(uid, info) + + +if __name__ == "__main__": + main() diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json new file mode 100644 index 0000000..d3393a9 --- /dev/null +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -0,0 +1,369 @@ +{ + "uid": "atlas-nodes", + "title": "Atlas Nodes", + "folderUid": "atlas-nodes", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Node count", + 
"datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "count(kube_node_info)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Ready nodes", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Control plane CPU avg", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) 
group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 4, + "type": "stat", + "title": "Control plane RAM avg", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 5, + "type": "timeseries", + "title": "Node CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 5 + }, + "targets": [ + { + 
"expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 6, + "type": "timeseries", + "title": "Node RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 14 + }, + "targets": [ + { + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 7, + "type": "timeseries", + "title": "Root filesystem", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 23 + }, + "targets": [ + { + "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" 
+ }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "7d" + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "nodes" + ] +} diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json new file mode 100644 index 0000000..d7a0d27 --- /dev/null +++ b/services/monitoring/dashboards/atlas-overview.json @@ -0,0 +1,1270 @@ +{ + "uid": "atlas-overview", + "title": "Atlas Overview", + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": false, + "folderUid": "atlas-overview", + "graphTooltip": 0, + "links": [ + { + "title": "Pods dashboard", + "type": "dashboard", + "dashboardUid": "atlas-pods", + "keepTime": false + }, + { + "title": "Nodes dashboard", + "type": "dashboard", + "dashboardUid": "atlas-nodes", + "keepTime": false + }, + { + "title": "Storage dashboard", + "type": "dashboard", + "dashboardUid": "atlas-storage", + "keepTime": false + } + ], + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Running pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_pod_status_phase{phase=\"Running\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + 
"lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Ready nodes", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Control plane ready", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Control plane schedulable", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 12, + "y": 0 + }, 
+ "targets": [ + { + "expr": "sum(kube_node_spec_unschedulable{node=~\"titan-0a|titan-0b|titan-0c\"} == 0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Problem pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Stuck terminating", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 0 + }, + "targets": [ + { + "expr": "sum(((time() - kube_pod_deletion_timestamp) > 600))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + 
"color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Hottest node: CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 24, + "y": 0 + }, + "targets": [ + { + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 8, + "type": "stat", + "title": "Hottest node: RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 28, + "y": 0 + }, + "targets": [ + { + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": 
"percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 9, + "type": "piechart", + "title": "Namespace CPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 5 + }, + "targets": [ + { + "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 10, + "type": "piechart", + "title": "Namespace RAM share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 5 + }, + "targets": [ + { + "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 11, + "type": "timeseries", + "title": "Cluster node CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, 
+ "y": 14 + }, + "targets": [ + { + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 12, + "type": "timeseries", + "title": "Cluster node RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "targets": [ + { + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 13, + "type": "table", + "title": "Problem pods (details)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 22 + }, + "targets": [ + { + "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 14, + "type": "table", + 
"title": "Terminating >10m", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "targets": [ + { + "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "filterByValue", + "options": { + "match": "Value", + "operator": "gt", + "value": 600 + } + } + ] + }, + { + "id": 15, + "type": "timeseries", + "title": "Control plane CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 30 + }, + "targets": [ + { + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 16, + "type": "timeseries", + "title": "Control plane RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 30 + }, + "targets": [ + { + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + 
"fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 17, + "type": "timeseries", + "title": "Root filesystem usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 37 + }, + "targets": [ + { + "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "7d" + }, + { + "id": 18, + "type": "bargauge", + "title": "Nodes closest to full root disks", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 37 + }, + "targets": [ + { + "expr": "topk(8, avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + 
"overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 19, + "type": "stat", + "title": "Astreae usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 45 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 20, + "type": "stat", + "title": "Asteria usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 45 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": 
"value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 21, + "type": "stat", + "title": "Astreae free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 45 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 22, + "type": "stat", + "title": "Asteria free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 45 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 23, + "type": "table", + "title": "Astreae per-node usage", + "datasource": { + 
"type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 51 + }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 24, + "type": "table", + "title": "Asteria per-node usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 51 + }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 25, + "type": "text", + "title": "About this dashboard", + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 59 + }, + "datasource": null, + "options": { + "mode": "markdown", + "content": "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders" + } + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "overview" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-12h", + "to": "now" + } +} diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json new file mode 100644 index 0000000..91f80eb --- 
/dev/null +++ b/services/monitoring/dashboards/atlas-pods.json @@ -0,0 +1,137 @@ +{ + "uid": "atlas-pods", + "title": "Atlas Pods", + "folderUid": "atlas-pods", + "editable": true, + "panels": [ + { + "id": 1, + "type": "table", + "title": "Pods not running", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 2, + "type": "table", + "title": "CrashLoop / ImagePull", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 10 + }, + "targets": [ + { + "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 3, + "type": "table", + "title": "Terminating pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 20 + }, + "targets": [ + { + "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true 
+ }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "filterByValue", + "options": { + "match": "Value", + "operator": "gt", + "value": 600 + } + } + ] + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "pods" + ] +} diff --git a/services/monitoring/dashboards/atlas-storage.json b/services/monitoring/dashboards/atlas-storage.json new file mode 100644 index 0000000..aa1948d --- /dev/null +++ b/services/monitoring/dashboards/atlas-storage.json @@ -0,0 +1,359 @@ +{ + "uid": "atlas-storage", + "title": "Atlas Storage", + "folderUid": "atlas-storage", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Astreae usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Asteria usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "100 - 
(sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Astreae free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Asteria free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "timeseries", + "title": "Root filesystem", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 5 + }, + "targets": [ + { + "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "30d" + }, + { + "id": 6, + "type": "table", + "title": "Astreae nodes", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 14 + }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 7, + "type": "table", + "title": "Asteria nodes", + "datasource": { + 
"type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 14 + }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "storage" + ] +} diff --git a/services/monitoring/grafana-dashboard-sre.yaml b/services/monitoring/grafana-dashboard-nodes.yaml similarity index 53% rename from services/monitoring/grafana-dashboard-sre.yaml rename to services/monitoring/grafana-dashboard-nodes.yaml index d5d8dca..516f207 100644 --- a/services/monitoring/grafana-dashboard-sre.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -1,38 +1,22 @@ -# services/monitoring/grafana-dashboard-sre.yaml +# services/monitoring/grafana-dashboard-nodes.yaml apiVersion: v1 kind: ConfigMap metadata: - name: grafana-dashboard-sre + name: grafana-dashboard-nodes labels: grafana_dashboard: "1" data: - atlas-sre-overview.json: | + atlas-nodes.json: | { - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, + "uid": "atlas-nodes", + "title": "Atlas Nodes", + "folderUid": "atlas-nodes", "editable": true, - "folderUid": "atlas-sre", - "graphTooltip": 0, - "links": [], "panels": [ { "id": 1, "type": "stat", - "title": "Ready nodes", + "title": "Node count", "datasource": { "type": "prometheus", "uid": "atlas-vm" 
@@ -45,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / count(kube_node_info) * 100", + "expr": "count(kube_node_info)", "refId": "A" } ], @@ -56,23 +40,19 @@ data: }, "mappings": [], "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ { - "color": "red", + "color": "rgba(115, 115, 115, 1)", "value": null }, - { - "color": "yellow", - "value": 95 - }, { "color": "green", - "value": 99 + "value": 1 } ] }, - "unit": "percent" + "unit": "none" }, "overrides": [] }, @@ -93,7 +73,7 @@ data: { "id": 2, "type": "stat", - "title": "Pending pods", + "title": "Ready nodes", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -106,7 +86,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_status_phase{phase=\"Pending\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", "refId": "A" } ], @@ -120,16 +100,12 @@ data: "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", - "value": 3 - }, - { - "color": "red", - "value": 10 + "color": "green", + "value": 1 } ] }, @@ -154,7 +130,7 @@ data: { "id": 3, "type": "stat", - "title": "Unavailable deployment replicas", + "title": "Control plane CPU avg", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -167,8 +143,9 @@ data: }, "targets": [ { - "expr": "sum(kube_deployment_status_replicas_unavailable)", - "refId": "A" + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -181,20 +158,16 @@ data: "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", + 
"color": "green", "value": 1 - }, - { - "color": "red", - "value": 3 } ] }, - "unit": "none" + "unit": "percent" }, "overrides": [] }, @@ -209,13 +182,13 @@ data: "fields": "", "values": false }, - "textMode": "value" + "textMode": "value_and_name" } }, { "id": 4, "type": "stat", - "title": "Active alerts", + "title": "Control plane RAM avg", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -228,8 +201,9 @@ data: }, "targets": [ { - "expr": "sum(ALERTS{alertstate=\"firing\"})", - "refId": "A" + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -242,20 +216,16 @@ data: "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", + "color": "green", "value": 1 - }, - { - "color": "red", - "value": 3 } ] }, - "unit": "none" + "unit": "percent" }, "overrides": [] }, @@ -270,20 +240,20 @@ data: "fields": "", "values": false }, - "textMode": "value" + "textMode": "value_and_name" } }, { "id": 5, "type": "timeseries", - "title": "Node CPU usage", + "title": "Node CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 12, + "w": 24, "x": 0, "y": 5 }, @@ -303,7 +273,10 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right", + "calcs": [ + "last" + ] }, "tooltip": { "mode": "multi" @@ -313,16 +286,16 @@ data: { "id": 6, "type": "timeseries", - "title": "Node memory usage", + "title": "Node RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 12, - "x": 12, - "y": 5 + "w": 24, + "x": 0, + "y": 14 }, "targets": [ { @@ -340,7 +313,10 
@@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right", + "calcs": [ + "last" + ] }, "tooltip": { "mode": "multi" @@ -350,201 +326,22 @@ data: { "id": 7, "type": "timeseries", - "title": "Top pod CPU (5m avg)", + "title": "Root filesystem", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 12, - "x": 0, - "y": 14 - }, - "targets": [ - { - "expr": "topk(5, sum(rate(container_cpu_usage_seconds_total{pod!=\"\",container!=\"\"}[5m])) by (namespace,pod))", - "refId": "A", - "legendFormat": "{{namespace}}/{{pod}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "cores" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 8, - "type": "timeseries", - "title": "Top pod memory working set", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 14 - }, - "targets": [ - { - "expr": "topk(5, sum(container_memory_working_set_bytes{pod!=\"\",container!=\"\"}) by (namespace,pod))", - "refId": "A", - "legendFormat": "{{namespace}}/{{pod}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "bytes" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 9, - "type": "bargauge", - "title": "Namespace restart rate (6h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, + "w": 24, "x": 0, "y": 23 }, "targets": [ { - "expr": "topk(8, sum(increase(kube_pod_container_status_restarts_total{namespace!=\"\"}[6h])) by (namespace))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - 
"lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 10, - "type": "table", - "title": "Deployments missing replicas", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 23 - }, - "targets": [ - { - "expr": "topk(10, sum by (namespace,deployment) (kube_deployment_status_replicas_unavailable))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [] - }, - "options": { - "showHeader": true - } - }, - { - "id": 11, - "type": "timeseries", - "title": "Pod phase breakdown", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 31 - }, - "targets": [ - { - "expr": "sum(kube_pod_status_phase) by (phase)", + "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", "refId": "A", - "legendFormat": "{{phase}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 12, - "type": "timeseries", - "title": "PVC usage (top 8)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 31 - }, - "targets": [ - { - "expr": "topk(8, sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100))", - "refId": "A", - "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}" + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -556,28 +353,26 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right" 
}, "tooltip": { "mode": "multi" } - } + }, + "timeFrom": "7d" } ], - "schemaVersion": 39, - "style": "dark", - "tags": [ - "atlas", - "sre" - ], - "templating": { - "list": [] - }, "time": { "from": "now-12h", "to": "now" }, - "title": "Atlas SRE Overview", - "uid": "atlas-sre", - "version": 4 + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "nodes" + ] } diff --git a/services/monitoring/grafana-dashboard-public.yaml b/services/monitoring/grafana-dashboard-overview.yaml similarity index 67% rename from services/monitoring/grafana-dashboard-public.yaml rename to services/monitoring/grafana-dashboard-overview.yaml index 35fa124..a20e05a 100644 --- a/services/monitoring/grafana-dashboard-public.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1,13 +1,15 @@ -# services/monitoring/grafana-dashboard-public.yaml +# services/monitoring/grafana-dashboard-overview.yaml apiVersion: v1 kind: ConfigMap metadata: - name: grafana-dashboard-public + name: grafana-dashboard-overview labels: grafana_dashboard: "1" data: - atlas-public-overview.json: | + atlas-overview.json: | { + "uid": "atlas-overview", + "title": "Atlas Overview", "annotations": { "list": [ { @@ -25,9 +27,28 @@ data: ] }, "editable": false, - "folderUid": "atlas-public", + "folderUid": "atlas-overview", "graphTooltip": 0, - "links": [], + "links": [ + { + "title": "Pods dashboard", + "type": "dashboard", + "dashboardUid": "atlas-pods", + "keepTime": false + }, + { + "title": "Nodes dashboard", + "type": "dashboard", + "dashboardUid": "atlas-nodes", + "keepTime": false + }, + { + "title": "Storage dashboard", + "type": "dashboard", + "dashboardUid": "atlas-storage", + "keepTime": false + } + ], "panels": [ { "id": 1, @@ -146,7 +167,7 @@ data: { "id": 3, "type": "stat", - "title": "Cluster nodes", + "title": "Control plane ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -159,7 +180,7 @@ data: }, "targets": [ { - "expr": 
"count(kube_node_info)", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})", "refId": "A" } ], @@ -203,7 +224,7 @@ data: { "id": 4, "type": "stat", - "title": "Hottest node CPU", + "title": "Control plane schedulable", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -216,10 +237,182 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)))", + "expr": "sum(kube_node_spec_unschedulable{node=~\"titan-0a|titan-0b|titan-0c\"} == 0)", "refId": "A" } ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Problem pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + 
"fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Stuck terminating", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 0 + }, + "targets": [ + { + "expr": "sum(((time() - kube_pod_deletion_timestamp) > 600))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Hottest node: CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 24, + "y": 0 + }, + "targets": [ + { + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], "fieldConfig": { "defaults": { "color": { @@ -262,9 +455,9 @@ data: } }, { - "id": 5, + "id": 8, "type": "stat", - "title": "Hottest node memory", + "title": "Hottest node: RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -272,13 +465,14 @@ data: "gridPos": { "h": 5, "w": 4, - "x": 16, + "x": 28, "y": 0 }, "targets": [ { - "expr": "topk(1, avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)))", - "refId": "A" + "expr": "avg 
by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -296,11 +490,11 @@ data: }, { "color": "yellow", - "value": 75 + "value": 70 }, { "color": "red", - "value": 90 + "value": 85 } ] }, @@ -323,68 +517,7 @@ data: } }, { - "id": 6, - "type": "stat", - "title": "Failed pods (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 4, - "x": 20, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(kube_pod_status_phase{phase=\"Failed\"}[24h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 7, + "id": 9, "type": "piechart", "title": "Namespace CPU share", "datasource": { @@ -400,12 +533,13 @@ data: "targets": [ { "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))", - "refId": "A" + "refId": "A", + "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { - "unit": "cores" + "unit": "percent" }, "overrides": [] }, @@ -425,9 +559,9 @@ data: } }, { - "id": 8, + "id": 10, "type": "piechart", - "title": "Namespace memory share", + "title": "Namespace RAM share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -441,12 +575,13 @@ data: "targets": [ { 
"expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))", - "refId": "A" + "refId": "A", + "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { - "unit": "bytes" + "unit": "percent" }, "overrides": [] }, @@ -455,7 +590,7 @@ data: "displayMode": "list", "placement": "right" }, - "pieType": "donut", + "pieType": "pie", "reduceOptions": { "calcs": [ "lastNotNull" @@ -466,9 +601,9 @@ data: } }, { - "id": 9, + "id": 11, "type": "timeseries", - "title": "Node CPU usage (per node)", + "title": "Cluster node CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -495,7 +630,10 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right", + "calcs": [ + "last" + ] }, "tooltip": { "mode": "multi" @@ -503,9 +641,9 @@ data: } }, { - "id": 10, + "id": 12, "type": "timeseries", - "title": "Node memory usage (per node)", + "title": "Cluster node RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -532,80 +670,20 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right", + "calcs": [ + "last" + ] }, "tooltip": { "mode": "multi" } } }, - { - "id": 11, - "type": "table", - "title": "Key service availability", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 22 - }, - "targets": [ - { - "expr": "label_replace((sum by (deployment,namespace) (kube_deployment_status_replicas_available{deployment=~\"traefik|gitea|grafana\",namespace=~\"traefik|gitea|monitoring\"}) / sum by (deployment,namespace) (kube_deployment_spec_replicas{deployment=~\"traefik|gitea|grafana\",namespace=~\"traefik|gitea|monitoring\"})), \"service\", \"$1\", \"deployment\", \"(.*)\") or label_replace((sum by (statefulset,namespace) 
(kube_statefulset_status_replicas_ready{statefulset=~\"vault|alertmanager|victoria-metrics-single-server\",namespace=~\"vault|monitoring\"}) / sum by (statefulset,namespace) (kube_statefulset_status_replicas{statefulset=~\"vault|alertmanager|victoria-metrics-single-server\",namespace=~\"vault|monitoring\"})), \"service\", \"$1\", \"statefulset\", \"(.*)\")", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "custom": { - "align": "auto" - }, - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "showHeader": true - } - }, - { - "id": 12, - "type": "table", - "title": "Failed pods by namespace (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 22 - }, - "targets": [ - { - "expr": "topk(10, sum by (namespace) (increase(kube_pod_status_phase{phase=\"Failed\"}[24h])))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [] - }, - "options": { - "showHeader": true - } - }, { "id": 13, - "type": "timeseries", - "title": "Root filesystem usage per node", + "type": "table", + "title": "Problem pods (details)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -614,11 +692,91 @@ data: "h": 8, "w": 12, "x": 0, - "y": 29 + "y": 22 }, "targets": [ { - "expr": "avg by (node) (((label_replace(1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ 
+ { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 14, + "type": "table", + "title": "Terminating >10m", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "targets": [ + { + "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "filterByValue", + "options": { + "match": "Value", + "operator": "gt", + "value": 600 + } + } + ] + }, + { + "id": 15, + "type": "timeseries", + "title": "Control plane CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 30 + }, + "targets": [ + { + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", "refId": "A", "legendFormat": "{{node}}" } @@ -640,7 +798,85 @@ data: } }, { - "id": 14, + "id": 16, + "type": "timeseries", + "title": "Control plane RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 30 + }, + "targets": [ + { + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + 
"options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 17, + "type": "timeseries", + "title": "Root filesystem usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 37 + }, + "targets": [ + { + "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "7d" + }, + { + "id": 18, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -651,19 +887,41 @@ data: "h": 8, "w": 12, "x": 12, - "y": 29 + "y": 37 }, "targets": [ { - "expr": "topk(8, avg by (node) (((label_replace(1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)))", - "refId": "A" + "expr": "topk(8, avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent", "min": 0, - "max": 100 + "max": 100, + "thresholds": { + "mode": "percentage", + 
"steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } }, "overrides": [] }, @@ -680,7 +938,7 @@ data: } }, { - "id": 15, + "id": 19, "type": "stat", "title": "Astreae usage", "datasource": { @@ -688,14 +946,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 6, "w": 6, "x": 0, - "y": 37 + "y": 45 }, "targets": [ { - "expr": "(sum(longhorn_disk_usage_bytes{disk=~\"astreae-.*\"}) / sum(longhorn_disk_capacity_bytes{disk=~\"astreae-.*\"})) * 100", + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], @@ -741,7 +999,7 @@ data: } }, { - "id": 16, + "id": 20, "type": "stat", "title": "Asteria usage", "datasource": { @@ -749,14 +1007,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 6, "w": 6, "x": 6, - "y": 37 + "y": 45 }, "targets": [ { - "expr": "(sum(longhorn_disk_usage_bytes{disk=~\"asteria-.*\"}) / sum(longhorn_disk_capacity_bytes{disk=~\"asteria-.*\"})) * 100", + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], @@ -802,22 +1060,22 @@ data: } }, { - "id": 17, + "id": 21, "type": "stat", - "title": "Astreae schedulable", + "title": "Astreae free", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 6, "w": 6, "x": 12, - "y": 37 + "y": 45 }, "targets": [ { - "expr": "(sum(longhorn_disk_capacity_bytes{disk=~\"astreae-.*\"}) - sum(longhorn_disk_usage_bytes{disk=~\"astreae-.*\"}) - sum(longhorn_disk_reservation_bytes{disk=~\"astreae-.*\"}))", + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})", 
"refId": "A" } ], @@ -859,22 +1117,22 @@ data: } }, { - "id": 18, + "id": 22, "type": "stat", - "title": "Asteria schedulable", + "title": "Asteria free", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 6, "w": 6, "x": 18, - "y": 37 + "y": 45 }, "targets": [ { - "expr": "(sum(longhorn_disk_capacity_bytes{disk=~\"asteria-.*\"}) - sum(longhorn_disk_usage_bytes{disk=~\"asteria-.*\"}) - sum(longhorn_disk_reservation_bytes{disk=~\"asteria-.*\"}))", + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})", "refId": "A" } ], @@ -916,112 +1174,91 @@ data: } }, { - "id": 19, - "type": "piechart", - "title": "Longhorn node readiness", + "id": 23, + "type": "table", + "title": "Astreae per-node usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 8, "w": 12, "x": 0, - "y": 44 + "y": 51 }, "targets": [ { - "expr": "sum(longhorn_node_status{condition=\"ready\"})", - "refId": "A", - "legendFormat": "Ready" - }, - { - "expr": "(longhorn_node_count_total - sum(longhorn_node_status{condition=\"ready\"}))", - "refId": "B", - "legendFormat": "Offline" + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "none" + "unit": "percent" }, "overrides": [] }, "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "donut", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} } - } + ] }, { - "id": 20, - "type": "piechart", - "title": "Longhorn disk schedulability", + "id": 24, + "type": "table", + "title": "Asteria per-node usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { 
- "h": 7, + "h": 8, "w": 12, "x": 12, - "y": 44 + "y": 51 }, "targets": [ { - "expr": "sum(sum by (node,disk) (longhorn_disk_status{condition=\"schedulable\"}))", - "refId": "A", - "legendFormat": "Schedulable" - }, - { - "expr": "(count(sum by (node,disk) (longhorn_disk_status{condition=\"ready\"})) - sum(sum by (node,disk) (longhorn_disk_status{condition=\"schedulable\"})))", - "refId": "B", - "legendFormat": "Blocked" + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "none" + "unit": "percent" }, "overrides": [] }, "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "donut", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} } - } + ] }, { - "id": 21, + "id": 25, "type": "text", "title": "About this dashboard", "gridPos": { "h": 5, "w": 24, "x": 0, - "y": 51 + "y": 59 }, + "datasource": null, "options": { "mode": "markdown", - "content": "### Atlas at a glance\n- Flux-managed Pi + Jetson cluster with 20+ active nodes\n- Longhorn tiers: Astreae (3x replicas) & Asteria (2x replicas) tracked separately\n- Login for the SRE view with alert routing, Longhorn drilldowns, and workload burn rates" + "content": "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders" } } ], @@ -1030,7 +1267,7 @@ data: "style": "dark", "tags": [ "atlas", - "public" + "overview" ], "templating": { "list": [] @@ -1038,8 +1275,5 @@ data: "time": { "from": "now-12h", "to": "now" - }, - "title": "Atlas Public Overview", - "uid": "atlas-public", 
- "version": 5 + } } diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml new file mode 100644 index 0000000..3b1f5da --- /dev/null +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -0,0 +1,146 @@ +# services/monitoring/grafana-dashboard-pods.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-pods + labels: + grafana_dashboard: "1" +data: + atlas-pods.json: | + { + "uid": "atlas-pods", + "title": "Atlas Pods", + "folderUid": "atlas-pods", + "editable": true, + "panels": [ + { + "id": 1, + "type": "table", + "title": "Pods not running", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 2, + "type": "table", + "title": "CrashLoop / ImagePull", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 10 + }, + "targets": [ + { + "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 3, + "type": "table", + "title": "Terminating pods", + "datasource": 
{ + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 20 + }, + "targets": [ + { + "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "filterByValue", + "options": { + "match": "Value", + "operator": "gt", + "value": 600 + } + } + ] + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "pods" + ] + } diff --git a/services/monitoring/grafana-dashboard-storage.yaml b/services/monitoring/grafana-dashboard-storage.yaml new file mode 100644 index 0000000..5b22804 --- /dev/null +++ b/services/monitoring/grafana-dashboard-storage.yaml @@ -0,0 +1,368 @@ +# services/monitoring/grafana-dashboard-storage.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-storage + labels: + grafana_dashboard: "1" +data: + atlas-storage.json: | + { + "uid": "atlas-storage", + "title": "Atlas Storage", + "folderUid": "atlas-storage", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Astreae usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 
+ } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Asteria usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Astreae free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": 
[ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Asteria free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "timeseries", + "title": "Root filesystem", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 5 + }, + "targets": [ + { + "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "30d" + }, + { + "id": 6, + "type": "table", + "title": "Astreae nodes", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 14 + }, + "targets": [ + { + "expr": "100 - 
(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 7, + "type": "table", + "title": "Asteria nodes", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 14 + }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "storage" + ] + } diff --git a/services/monitoring/grafana-folders.yaml b/services/monitoring/grafana-folders.yaml index 503aaee..d390679 100644 --- a/services/monitoring/grafana-folders.yaml +++ b/services/monitoring/grafana-folders.yaml @@ -10,8 +10,8 @@ data: folders.yaml: | apiVersion: 1 folders: - - uid: atlas-public - title: Atlas Public + - uid: atlas-overview + title: Atlas Overview permissions: - role: Viewer permission: View @@ -19,8 +19,22 @@ data: permission: Edit - role: Admin permission: Admin - - uid: atlas-sre - title: Atlas SRE + - uid: atlas-pods + title: Atlas Pods + permissions: + - role: Editor + permission: View + - role: Admin + permission: Admin + - uid: atlas-nodes + title: Atlas Nodes + permissions: + - role: Editor + permission: View + - role: Admin + 
permission: Admin + - uid: atlas-storage + title: Atlas Storage permissions: - role: Editor permission: View diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 4efae70..e23f903 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -244,8 +244,8 @@ spec: GF_SECURITY_ALLOW_EMBEDDING: "true" grafana.ini: server: - domain: atlas.metrics.bstein.dev - root_url: https://atlas.metrics.bstein.dev/ + domain: metrics.bstein.dev + root_url: https://metrics.bstein.dev/ auth.anonymous: hide_version: true users: @@ -256,12 +256,12 @@ spec: annotations: cert-manager.io/cluster-issuer: letsencrypt hosts: - - atlas.metrics.bstein.dev + - metrics.bstein.dev path: / tls: - - secretName: grafana-atlas-metrics-tls + - secretName: grafana-metrics-tls hosts: - - atlas.metrics.bstein.dev + - metrics.bstein.dev datasources: datasources.yaml: apiVersion: 1 @@ -278,25 +278,43 @@ spec: dashboardproviders.yaml: apiVersion: 1 providers: - - name: public + - name: overview orgId: 1 - folder: Atlas Public + folder: Atlas Overview type: file disableDeletion: false editable: false options: - path: /var/lib/grafana/dashboards/public - - name: sre + path: /var/lib/grafana/dashboards/overview + - name: pods orgId: 1 - folder: Atlas SRE + folder: Atlas Pods type: file disableDeletion: false editable: true options: - path: /var/lib/grafana/dashboards/sre + path: /var/lib/grafana/dashboards/pods + - name: nodes + orgId: 1 + folder: Atlas Nodes + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/nodes + - name: storage + orgId: 1 + folder: Atlas Storage + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/storage dashboardsConfigMaps: - public: grafana-dashboard-public - sre: grafana-dashboard-sre + overview: grafana-dashboard-overview + pods: grafana-dashboard-pods + nodes: grafana-dashboard-nodes + storage: 
grafana-dashboard-storage extraConfigmapMounts: - name: grafana-folders mountPath: /etc/grafana/provisioning/folders @@ -327,14 +345,14 @@ spec: annotations: cert-manager.io/cluster-issuer: letsencrypt hosts: - - host: atlas.alerts.bstein.dev + - host: alerts.bstein.dev paths: - path: / pathType: Prefix tls: - secretName: alerts-bstein-dev-tls hosts: - - atlas.alerts.bstein.dev + - alerts.bstein.dev config: global: resolve_timeout: 5m diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 73e7d23..282ee4f 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -5,7 +5,9 @@ namespace: monitoring resources: - namespace.yaml - rbac.yaml - - grafana-dashboard-public.yaml - - grafana-dashboard-sre.yaml + - grafana-dashboard-overview.yaml + - grafana-dashboard-pods.yaml + - grafana-dashboard-nodes.yaml + - grafana-dashboard-storage.yaml - grafana-folders.yaml - helmrelease.yaml From 8f5781d3cf22aef5018d84af6b1e592c26c830a9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 16:27:38 -0300 Subject: [PATCH 14/71] monitoring: rebuild atlas dashboards --- scripts/render_dashboards.py | 1009 +++++++++++++---- .../monitoring/dashboards/atlas-network.json | 384 +++++++ .../monitoring/dashboards/atlas-nodes.json | 212 ++-- .../monitoring/dashboards/atlas-overview.json | 872 ++++++++------ .../monitoring/dashboards/atlas-pods.json | 260 ++++- .../monitoring/dashboards/atlas-storage.json | 138 ++- .../monitoring/grafana-dashboard-network.yaml | 393 +++++++ .../monitoring/grafana-dashboard-nodes.yaml | 212 ++-- .../grafana-dashboard-overview.yaml | 872 ++++++++------ .../monitoring/grafana-dashboard-pods.yaml | 260 ++++- .../monitoring/grafana-dashboard-storage.yaml | 138 ++- services/monitoring/grafana-folders.yaml | 18 +- services/monitoring/helmrelease.yaml | 15 +- services/monitoring/kustomization.yaml | 1 + 14 files changed, 3559 insertions(+), 1225 deletions(-) mode change 
100755 => 100644 scripts/render_dashboards.py create mode 100644 services/monitoring/dashboards/atlas-network.json create mode 100644 services/monitoring/grafana-dashboard-network.yaml diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py old mode 100755 new mode 100644 index fa9ef58..67e486a --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -1,15 +1,20 @@ #!/usr/bin/env python3 -"""Generate Grafana dashboards and render them into ConfigMaps. +"""Generate Atlas Grafana dashboards and render them into ConfigMaps. Usage: - python scripts/render_dashboards.py --build # rebuild JSON + ConfigMaps - python scripts/render_dashboards.py # just render ConfigMaps + scripts/render_dashboards.py --build # rebuild JSON + ConfigMaps + scripts/render_dashboards.py # re-render ConfigMaps from JSON """ + import argparse import json import textwrap from pathlib import Path +# --------------------------------------------------------------------------- +# Paths, folders, and shared metadata +# --------------------------------------------------------------------------- + ROOT = Path(__file__).resolve().parents[1] DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards" CONFIG_TEMPLATE = textwrap.dedent( @@ -27,15 +32,194 @@ data: ) PROM_DS = {"type": "prometheus", "uid": "atlas-vm"} +PUBLIC_FOLDER = "atlas-overview" +PRIVATE_FOLDER = "atlas-internal" + +PERCENT_THRESHOLDS = { + "mode": "percentage", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 85}, + ], +} + +# --------------------------------------------------------------------------- +# Cluster metadata +# --------------------------------------------------------------------------- + +CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"] +CONTROL_DEPENDENCIES = ["titan-db"] +CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES +WORKER_NODES = [ + "titan-04", + "titan-05", + "titan-06", + "titan-07", + 
"titan-08", + "titan-09", + "titan-10", + "titan-11", + "titan-12", + "titan-13", + "titan-14", + "titan-15", + "titan-16", + "titan-17", + "titan-18", + "titan-19", + "titan-22", + "titan-24", +] + +CONTROL_REGEX = "|".join(CONTROL_PLANE_NODES) +CONTROL_ALL_REGEX = "|".join(CONTROL_ALL) +WORKER_REGEX = "|".join(WORKER_NODES) +CONTROL_TOTAL = len(CONTROL_PLANE_NODES) +WORKER_TOTAL = len(WORKER_NODES) +CONTROL_SUFFIX = f"/{CONTROL_TOTAL}" +WORKER_SUFFIX = f"/{WORKER_TOTAL}" +CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring" + +# --------------------------------------------------------------------------- +# PromQL helpers +# --------------------------------------------------------------------------- + +NODE_INFO = 'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")' -# --------------------------------------------------------------------------- # -# Panel helper factories -# --------------------------------------------------------------------------- # +def node_filter(regex): + """Return a selector that evaluates to 1 for nodes matching the regex.""" + return ( + f'label_replace(node_uname_info{{nodename=~"{regex}"}}, ' + '"node", "$1", "nodename", "(.*)")' + ) -def stat_panel(panel_id, title, expr, grid, *, unit="none", thresholds=None, - text_mode="value", legend=None): +def scoped_node_expr(base, scope=""): + """Attach nodename metadata and optionally filter to a scope regex.""" + expr = f"avg by (node) (({base}) * on(instance) group_left(node) {NODE_INFO})" + if scope: + expr = f"({expr}) * on(node) group_left() {node_filter(scope)}" + return expr + + +def node_cpu_expr(scope=""): + idle = 'avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))' + base = f"(1 - {idle}) * 100" + return scoped_node_expr(base, scope) + + +def node_mem_expr(scope=""): + usage = ( + "avg by (instance) (" + "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) " + "/ node_memory_MemTotal_bytes * 100)" + ) 
+ return scoped_node_expr(usage, scope) + + +def filesystem_usage_expr(mount, scope=""): + base = ( + f'avg by (instance) (' + f'(1 - (node_filesystem_avail_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}} ' + f'/ node_filesystem_size_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}})) * 100)' + ) + return scoped_node_expr(base, scope) + + +def root_usage_expr(scope=""): + return filesystem_usage_expr("/", scope) + + +def astreae_usage_expr(mount): + return ( + f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / " + f"sum(node_filesystem_size_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) * 100)" + ) + + +def astreae_free_expr(mount): + return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})" + + +PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' +CRASHLOOP_EXPR = ( + 'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason' + '{reason=~"CrashLoopBackOff|ImagePullBackOff"}))' +) +STUCK_TERMINATING_EXPR = ( + 'sum(max by (namespace,pod) ((' + '(time() - kube_pod_deletion_timestamp{pod!=""}) > 600' + ') and on(namespace,pod) kube_pod_deletion_timestamp{pod!=""} > 0))' +) + +PROBLEM_TABLE_EXPR = ( + "(time() - kube_pod_created{pod!=\"\"}) " + "* on(namespace,pod) group_left(node) kube_pod_info " + "* on(namespace,pod) group_left(phase) " + "max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})" +) +CRASHLOOP_TABLE_EXPR = ( + "(time() - kube_pod_created{pod!=\"\"}) " + "* on(namespace,pod) group_left(node) kube_pod_info " + "* on(namespace,pod,container) group_left(reason) " + "max by (namespace,pod,container,reason) " + "(kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" +) +STUCK_TABLE_EXPR = ( + "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) " + "* on(namespace,pod) group_left(node) kube_pod_info) " + "and on(namespace,pod) 
kube_pod_deletion_timestamp{pod!=\"\"} > 0" +) + +NAMESPACE_CPU_EXPR = ( + 'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=""' + ',pod!=""}[5m])) by (namespace))' +) +NAMESPACE_RAM_EXPR = ( + 'topk(10, sum(container_memory_working_set_bytes{namespace!=""' + ',pod!=""}) by (namespace))' +) +NET_SERIES_EXPR = ( + 'avg by (node) (' + 'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]) ' + '+ rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))' +) +NET_TOP_EXPR = f"topk(1, {NET_SERIES_EXPR})" +IO_SERIES_EXPR = ( + "avg by (node) (rate(node_disk_read_bytes_total[5m]) " + "+ rate(node_disk_written_bytes_total[5m]))" +) +IO_TOP_EXPR = f"topk(1, {IO_SERIES_EXPR})" +NET_INGRESS_EXPR = ( + 'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) ' + "or on() vector(0)" +) +NET_EGRESS_EXPR = ( + 'sum(rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m])) ' + "or on() vector(0)" +) + +# --------------------------------------------------------------------------- +# Panel factories +# --------------------------------------------------------------------------- + + +def stat_panel( + panel_id, + title, + expr, + grid, + *, + unit="none", + thresholds=None, + text_mode="value", + legend=None, + value_suffix=None, + links=None, +): + """Return a Grafana stat panel definition.""" defaults = { "color": {"mode": "palette-classic"}, "mappings": [], @@ -48,7 +232,10 @@ def stat_panel(panel_id, title, expr, grid, *, unit="none", thresholds=None, ], }, "unit": unit, + "custom": {"displayMode": "auto"}, } + if value_suffix: + defaults["custom"]["valueSuffix"] = value_suffix panel = { "id": panel_id, "type": "stat", @@ -67,12 +254,26 @@ def stat_panel(panel_id, title, expr, grid, *, unit="none", thresholds=None, } if legend: panel["targets"][0]["legendFormat"] = legend + if links: + panel["links"] = links return panel -def timeseries_panel(panel_id, title, expr, grid, *, unit="none", 
legend=None, - legend_display="table", legend_placement="bottom", - legend_calcs=None, time_from=None): +def timeseries_panel( + panel_id, + title, + expr, + grid, + *, + unit="none", + legend=None, + legend_display="table", + legend_placement="bottom", + legend_calcs=None, + time_from=None, + links=None, +): + """Return a Grafana time-series panel definition.""" panel = { "id": panel_id, "type": "timeseries", @@ -95,11 +296,21 @@ def timeseries_panel(panel_id, title, expr, grid, *, unit="none", legend=None, panel["options"]["legend"]["calcs"] = legend_calcs if time_from: panel["timeFrom"] = time_from + if links: + panel["links"] = links return panel -def table_panel(panel_id, title, expr, grid, *, unit="none", transformations=None, - description=None): +def table_panel( + panel_id, + title, + expr, + grid, + *, + unit="none", + transformations=None, +): + """Return a Grafana table panel definition.""" panel = { "id": panel_id, "type": "table", @@ -112,20 +323,25 @@ def table_panel(panel_id, title, expr, grid, *, unit="none", transformations=Non } if transformations: panel["transformations"] = transformations - if description: - panel["description"] = description return panel def pie_panel(panel_id, title, expr, grid): + """Return a pie chart panel with readable namespace labels.""" return { "id": panel_id, "type": "piechart", "title": title, "datasource": PROM_DS, "gridPos": grid, - "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}], - "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []}, + "targets": [{"expr": expr, "refId": "A"}], + "fieldConfig": { + "defaults": { + "unit": "percent", + "displayName": "{{namespace}}", + }, + "overrides": [], + }, "options": { "legend": {"displayMode": "list", "placement": "right"}, "pieType": "pie", @@ -145,192 +361,238 @@ def text_panel(panel_id, title, content, grid): } -def node_cpu_expr(scope=""): - expr = "avg by (node) ((((1 - 
label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))" - if scope: - expr = f"{expr} * on(node) group_left() kube_node_info{{node=~\"{scope}\"}}" - return expr +def link_to(uid): + return [{"title": f"Open {uid} dashboard", "url": f"/d/{uid}", "targetBlank": True}] -def node_mem_expr(scope=""): - expr = "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))" - if scope: - expr = f"{expr} * on(node) group_left() kube_node_info{{node=~\"{scope}\"}}" - return expr - - -def root_usage_expr(): - return "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)" - - -def astreae_usage_expr(mount): - return ( - f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / " - f"sum(node_filesystem_size_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) * 100)" - ) - - -def astreae_free_expr(mount): - return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})" +# --------------------------------------------------------------------------- +# Dashboard builders +# --------------------------------------------------------------------------- def build_overview(): - thresholds_percent = { - "mode": "percentage", - "steps": [ - {"color": "green", "value": None}, - {"color": "yellow", "value": 70}, - {"color": "red", "value": 85}, - ], - } panels = [] - stats = [ - (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})'), - (2, "Ready nodes", 
'sum(kube_node_status_condition{condition="Ready",status="true"})'), - (3, "Control plane ready", 'sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})'), - (4, "Control plane schedulable", 'sum(kube_node_spec_unschedulable{node=~"titan-0a|titan-0b|titan-0c"} == 0)'), - (5, "Problem pods", 'sum(kube_pod_status_phase{phase!~"Running|Succeeded"})'), - (6, "Stuck terminating", 'sum(((time() - kube_pod_deletion_timestamp) > 600))'), + + row1_stats = [ + (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None), + ( + 2, + "Ready nodes", + f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', + WORKER_SUFFIX, + WORKER_TOTAL, + None, + ), + ( + 3, + "Control plane ready", + f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})', + CONTROL_SUFFIX, + CONTROL_TOTAL, + None, + ), + ( + 4, + "Control plane workloads", + f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})', + None, + 1, + link_to("atlas-pods"), + ), + ( + 5, + "Problem pods", + PROBLEM_PODS_EXPR, + None, + 1, + link_to("atlas-pods"), + ), + ( + 6, + "Stuck terminating", + STUCK_TERMINATING_EXPR, + None, + 1, + link_to("atlas-pods"), + ), ] - for idx, (panel_id, title, expr) in enumerate(stats): + for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats): + thresholds = None + if panel_id in (2, 3): + thresholds = { + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "green", "value": ok_value}, + ], + } + elif panel_id >= 4: + thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": 1}, + ], + } panels.append( stat_panel( panel_id, title, expr, {"h": 5, "w": 4, "x": 4 * idx, "y": 0}, + value_suffix=suffix, + thresholds=thresholds, + links=links, ) ) - panels.append( - stat_panel( - 7, - "Hottest node: CPU", - 
node_cpu_expr(), - {"h": 5, "w": 4, "x": 24, "y": 0}, - unit="percent", - thresholds=thresholds_percent, - text_mode="value_and_name", - legend="{{node}}", - ) - ) - panels.append( - stat_panel( - 8, - "Hottest node: RAM", - node_mem_expr(), - {"h": 5, "w": 4, "x": 28, "y": 0}, - unit="percent", - thresholds=thresholds_percent, - text_mode="value_and_name", - legend="{{node}}", - ) - ) - panels.append(pie_panel(9, "Namespace CPU share", 'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace))', {"h": 9, "w": 12, "x": 0, "y": 5})) - panels.append(pie_panel(10, "Namespace RAM share", 'topk(10, sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace))', {"h": 9, "w": 12, "x": 12, "y": 5})) + hottest = [ + (7, "Hottest node: CPU", f"topk(1, {node_cpu_expr()})", "percent"), + (8, "Hottest node: RAM", f"topk(1, {node_mem_expr()})", "percent"), + (9, "Hottest node: NET", NET_TOP_EXPR, "bytes/sec"), + (10, "Hottest node: I/O", IO_TOP_EXPR, "bytes/sec"), + ] + for idx, (panel_id, title, expr, unit) in enumerate(hottest): + panels.append( + stat_panel( + panel_id, + title, + expr, + {"h": 5, "w": 6, "x": 6 * idx, "y": 5}, + unit=unit, + thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, + text_mode="value_and_name", + legend="{{node}}", + links=link_to("atlas-nodes"), + ) + ) + + panels.append( + pie_panel( + 11, + "Namespace CPU share", + NAMESPACE_CPU_EXPR, + {"h": 9, "w": 12, "x": 0, "y": 10}, + ) + ) + panels.append( + pie_panel( + 12, + "Namespace RAM share", + NAMESPACE_RAM_EXPR, + {"h": 9, "w": 12, "x": 12, "y": 10}, + ) + ) panels.append( timeseries_panel( - 11, + 13, "Cluster node CPU", node_cpu_expr(), - {"h": 8, "w": 12, "x": 0, "y": 14}, + {"h": 8, "w": 12, "x": 0, "y": 19}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", + links=link_to("atlas-nodes"), ) ) panels.append( timeseries_panel( - 12, + 14, 
"Cluster node RAM", node_mem_expr(), - {"h": 8, "w": 12, "x": 12, "y": 14}, + {"h": 8, "w": 12, "x": 12, "y": 19}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", - ) - ) - - panels.append( - table_panel( - 13, - "Problem pods (details)", - "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", - {"h": 8, "w": 12, "x": 0, "y": 22}, - unit="s", - transformations=[{"id": "labelsToFields", "options": {}}], - ) - ) - panels.append( - table_panel( - 14, - "Terminating >10m", - "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", - {"h": 8, "w": 12, "x": 12, "y": 22}, - unit="s", - transformations=[ - {"id": "labelsToFields", "options": {}} , - {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}}, - ], + links=link_to("atlas-nodes"), ) ) panels.append( timeseries_panel( 15, - "Control plane CPU", - node_cpu_expr("titan-0a|titan-0b|titan-0c"), - {"h": 7, "w": 12, "x": 0, "y": 30}, + "Control plane CPU (incl. titan-db)", + node_cpu_expr(CONTROL_ALL_REGEX), + {"h": 7, "w": 12, "x": 0, "y": 27}, unit="percent", legend="{{node}}", + legend_display="table", + legend_placement="right", ) ) panels.append( timeseries_panel( 16, - "Control plane RAM", - node_mem_expr("titan-0a|titan-0b|titan-0c"), - {"h": 7, "w": 12, "x": 12, "y": 30}, + "Control plane RAM (incl. 
titan-db)", + node_mem_expr(CONTROL_ALL_REGEX), + {"h": 7, "w": 12, "x": 12, "y": 27}, unit="percent", legend="{{node}}", + legend_display="table", + legend_placement="right", ) ) panels.append( timeseries_panel( 17, + "Cluster ingress throughput", + NET_INGRESS_EXPR, + {"h": 7, "w": 12, "x": 0, "y": 34}, + unit="bytes/sec", + legend_display="list", + legend_placement="bottom", + links=link_to("atlas-network"), + ) + ) + panels.append( + timeseries_panel( + 18, + "Cluster egress throughput", + NET_EGRESS_EXPR, + {"h": 7, "w": 12, "x": 12, "y": 34}, + unit="bytes/sec", + legend_display="list", + legend_placement="bottom", + links=link_to("atlas-network"), + ) + ) + + panels.append( + timeseries_panel( + 19, "Root filesystem usage", root_usage_expr(), - {"h": 8, "w": 12, "x": 0, "y": 37}, + {"h": 8, "w": 12, "x": 0, "y": 41}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", - time_from="7d", + time_from="30d", + links=link_to("atlas-storage"), ) ) - panels.append( { - "id": 18, + "id": 20, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": PROM_DS, - "gridPos": {"h": 8, "w": 12, "x": 12, "y": 37}, - "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}], + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 41}, + "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A"}], "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 50}, @@ -338,6 +600,7 @@ def build_overview(): {"color": "red", "value": 85}, ], }, + "displayName": "{{node}}", }, "overrides": [], }, @@ -346,143 +609,157 @@ def build_overview(): "orientation": "horizontal", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, }, + "links": link_to("atlas-storage"), } ) - panels.append( - stat_panel( - 19, 
- "Astreae usage", - astreae_usage_expr("/mnt/astreae"), - {"h": 6, "w": 6, "x": 0, "y": 45}, - unit="percent", - thresholds=thresholds_percent, + storage_panels = [ + (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), + (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), + (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "bytesSI"), + (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "bytesSI"), + ] + for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): + panels.append( + stat_panel( + panel_id, + title, + expr, + {"h": 6, "w": 6, "x": 6 * idx, "y": 49}, + unit=unit, + thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, + links=link_to("atlas-storage"), + ) ) - ) - panels.append( - stat_panel( - 20, - "Asteria usage", - astreae_usage_expr("/mnt/asteria"), - {"h": 6, "w": 6, "x": 6, "y": 45}, - unit="percent", - thresholds=thresholds_percent, - ) - ) - panels.append( - stat_panel( - 21, - "Astreae free", - astreae_free_expr("/mnt/astreae"), - {"h": 6, "w": 6, "x": 12, "y": 45}, - unit="bytesSI", - ) - ) - panels.append( - stat_panel( - 22, - "Asteria free", - astreae_free_expr("/mnt/asteria"), - {"h": 6, "w": 6, "x": 18, "y": 45}, - unit="bytesSI", - ) - ) - - panels.append( - table_panel( - 23, - "Astreae per-node usage", - '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} * 100)', - {"h": 8, "w": 12, "x": 0, "y": 51}, - unit="percent", - transformations=[{"id": "labelsToFields", "options": {}}], - ) - ) - panels.append( - table_panel( - 24, - "Asteria per-node usage", - '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} * 100)', - {"h": 8, "w": 12, "x": 12, "y": 51}, - unit="percent", - transformations=[{"id": "labelsToFields", "options": {}}], - ) - ) 
panels.append( text_panel( 25, "About this dashboard", - "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders", - {"h": 5, "w": 24, "x": 0, "y": 59}, + textwrap.dedent( + """\ +### Atlas Overview +- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs. +- Control plane workload count flags any non-system pods that slipped onto the HA nodes. +- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly.""" + ), + {"h": 5, "w": 24, "x": 0, "y": 55}, ) ) return { "uid": "atlas-overview", "title": "Atlas Overview", - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": {"type": "datasource", "uid": "grafana"}, - "enable": True, - "hide": True, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard", - } - ] - }, + "folderUid": PUBLIC_FOLDER, "editable": False, - "folderUid": "atlas-overview", - "graphTooltip": 0, - "links": [ - {"title": "Pods dashboard", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False}, - {"title": "Nodes dashboard", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False}, - {"title": "Storage dashboard", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False}, - ], + "annotations": {"list": []}, "panels": panels, - "refresh": "30s", "schemaVersion": 39, "style": "dark", "tags": ["atlas", "overview"], "templating": {"list": []}, "time": {"from": "now-12h", "to": "now"}, + "links": [ + {"title": "Atlas Pods", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False}, + {"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False}, + {"title": "Atlas Storage", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False}, + {"title": "Atlas Network", 
"type": "dashboard", "dashboardUid": "atlas-network", "keepTime": False}, + ], } def build_pods_dashboard(): panels = [] panels.append( - table_panel( + stat_panel( 1, - "Pods not running", - "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", - {"h": 10, "w": 24, "x": 0, "y": 0}, - unit="s", - transformations=[{"id": "labelsToFields", "options": {}}], + "Problem pods", + PROBLEM_PODS_EXPR, + {"h": 4, "w": 6, "x": 0, "y": 0}, + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": 1}, + ], + }, ) ) panels.append( - table_panel( + stat_panel( 2, "CrashLoop / ImagePull", - "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", - {"h": 10, "w": 24, "x": 0, "y": 10}, + CRASHLOOP_EXPR, + {"h": 4, "w": 6, "x": 6, "y": 0}, + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": 1}, + ], + }, + ) + ) + panels.append( + stat_panel( + 3, + "Stuck terminating (>10m)", + STUCK_TERMINATING_EXPR, + {"h": 4, "w": 6, "x": 12, "y": 0}, + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": 1}, + ], + }, + ) + ) + panels.append( + stat_panel( + 4, + "Control plane workloads", + f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})', + {"h": 4, "w": 6, "x": 18, "y": 0}, + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": 1}, + ], + }, + ) + ) + + panels.append( + table_panel( + 5, + "Pods not running", + PROBLEM_TABLE_EXPR, + {"h": 10, "w": 24, "x": 0, "y": 4}, unit="s", 
transformations=[{"id": "labelsToFields", "options": {}}], ) ) panels.append( table_panel( - 3, - "Terminating pods", - "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", - {"h": 10, "w": 24, "x": 0, "y": 20}, + 6, + "CrashLoop / ImagePull", + CRASHLOOP_TABLE_EXPR, + {"h": 10, "w": 24, "x": 0, "y": 14}, + unit="s", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + panels.append( + table_panel( + 7, + "Terminating >10m", + STUCK_TABLE_EXPR, + {"h": 10, "w": 24, "x": 0, "y": 24}, unit="s", transformations=[ - {"id": "labelsToFields", "options": {}} , + {"id": "labelsToFields", "options": {}}, {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}}, ], ) @@ -490,7 +767,7 @@ def build_pods_dashboard(): return { "uid": "atlas-pods", "title": "Atlas Pods", - "folderUid": "atlas-pods", + "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, "time": {"from": "now-12h", "to": "now"}, @@ -503,17 +780,99 @@ def build_pods_dashboard(): def build_nodes_dashboard(): panels = [] - panels.append(stat_panel(1, "Node count", 'count(kube_node_info)', {"h": 5, "w": 6, "x": 0, "y": 0})) - panels.append(stat_panel(2, "Ready nodes", 'sum(kube_node_status_condition{condition="Ready",status="true"})', {"h": 5, "w": 6, "x": 6, "y": 0})) - panels.append(stat_panel(3, "Control plane CPU avg", node_cpu_expr("titan-0a|titan-0b|titan-0c"), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="percent", legend="{{node}}", text_mode="value_and_name")) - panels.append(stat_panel(4, "Control plane RAM avg", node_mem_expr("titan-0a|titan-0b|titan-0c"), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="percent", legend="{{node}}", text_mode="value_and_name")) - panels.append(timeseries_panel(5, "Node CPU", node_cpu_expr(), {"h": 9, "w": 24, "x": 0, "y": 5}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right")) - panels.append(timeseries_panel(6, "Node RAM", 
node_mem_expr(), {"h": 9, "w": 24, "x": 0, "y": 14}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right")) - panels.append(timeseries_panel(7, "Root filesystem", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 23}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="7d")) + panels.append( + stat_panel( + 1, + "Worker nodes ready", + f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', + {"h": 4, "w": 8, "x": 0, "y": 0}, + value_suffix=WORKER_SUFFIX, + ) + ) + panels.append( + stat_panel( + 2, + "Control plane ready", + f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})', + {"h": 4, "w": 8, "x": 8, "y": 0}, + value_suffix=CONTROL_SUFFIX, + ) + ) + panels.append( + stat_panel( + 3, + "Control plane workloads", + f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})', + {"h": 4, "w": 8, "x": 16, "y": 0}, + ) + ) + panels.append( + timeseries_panel( + 4, + "Node CPU", + node_cpu_expr(), + {"h": 9, "w": 24, "x": 0, "y": 4}, + unit="percent", + legend="{{node}}", + legend_calcs=["last"], + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 5, + "Node RAM", + node_mem_expr(), + {"h": 9, "w": 24, "x": 0, "y": 13}, + unit="percent", + legend="{{node}}", + legend_calcs=["last"], + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 6, + "Control plane (incl. titan-db) CPU", + node_cpu_expr(CONTROL_ALL_REGEX), + {"h": 9, "w": 12, "x": 0, "y": 22}, + unit="percent", + legend="{{node}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 7, + "Control plane (incl. 
titan-db) RAM", + node_mem_expr(CONTROL_ALL_REGEX), + {"h": 9, "w": 12, "x": 12, "y": 22}, + unit="percent", + legend="{{node}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 8, + "Root filesystem usage", + root_usage_expr(), + {"h": 9, "w": 24, "x": 0, "y": 31}, + unit="percent", + legend="{{node}}", + legend_display="table", + legend_placement="right", + time_from="30d", + ) + ) return { "uid": "atlas-nodes", "title": "Atlas Nodes", - "folderUid": "atlas-nodes", + "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, "time": {"from": "now-12h", "to": "now"}, @@ -526,17 +885,94 @@ def build_nodes_dashboard(): def build_storage_dashboard(): panels = [] - panels.append(stat_panel(1, "Astreae usage", astreae_usage_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 0, "y": 0}, unit="percent")) - panels.append(stat_panel(2, "Asteria usage", astreae_usage_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 6, "y": 0}, unit="percent")) - panels.append(stat_panel(3, "Astreae free", astreae_free_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="bytesSI")) - panels.append(stat_panel(4, "Asteria free", astreae_free_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="bytesSI")) - panels.append(timeseries_panel(5, "Root filesystem", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 5}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="30d")) - panels.append(table_panel(6, "Astreae nodes", '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} * 100)', {"h": 10, "w": 12, "x": 0, "y": 14}, unit="percent", transformations=[{"id": "labelsToFields", "options": {}}])) - panels.append(table_panel(7, "Asteria nodes", '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} / 
node_filesystem_size_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} * 100)', {"h": 10, "w": 12, "x": 12, "y": 14}, unit="percent", transformations=[{"id": "labelsToFields", "options": {}}])) + panels.append( + stat_panel( + 1, + "Astreae usage", + astreae_usage_expr("/mnt/astreae"), + {"h": 5, "w": 6, "x": 0, "y": 0}, + unit="percent", + thresholds=PERCENT_THRESHOLDS, + ) + ) + panels.append( + stat_panel( + 2, + "Asteria usage", + astreae_usage_expr("/mnt/asteria"), + {"h": 5, "w": 6, "x": 6, "y": 0}, + unit="percent", + thresholds=PERCENT_THRESHOLDS, + ) + ) + panels.append( + stat_panel( + 3, + "Astreae free", + astreae_free_expr("/mnt/astreae"), + {"h": 5, "w": 6, "x": 12, "y": 0}, + unit="bytesSI", + ) + ) + panels.append( + stat_panel( + 4, + "Asteria free", + astreae_free_expr("/mnt/asteria"), + {"h": 5, "w": 6, "x": 18, "y": 0}, + unit="bytesSI", + ) + ) + panels.append( + timeseries_panel( + 5, + "Astreae per-node usage", + filesystem_usage_expr("/mnt/astreae"), + {"h": 9, "w": 12, "x": 0, "y": 5}, + unit="percent", + legend="{{node}}", + legend_display="table", + legend_placement="right", + time_from="30d", + ) + ) + panels.append( + timeseries_panel( + 6, + "Asteria per-node usage", + filesystem_usage_expr("/mnt/asteria"), + {"h": 9, "w": 12, "x": 12, "y": 5}, + unit="percent", + legend="{{node}}", + legend_display="table", + legend_placement="right", + time_from="30d", + ) + ) + panels.append( + timeseries_panel( + 7, + "Astreae usage history", + astreae_usage_expr("/mnt/astreae"), + {"h": 9, "w": 12, "x": 0, "y": 14}, + unit="percent", + time_from="90d", + ) + ) + panels.append( + timeseries_panel( + 8, + "Asteria usage history", + astreae_usage_expr("/mnt/asteria"), + {"h": 9, "w": 12, "x": 12, "y": 14}, + unit="percent", + time_from="90d", + ) + ) return { "uid": "atlas-storage", "title": "Atlas Storage", - "folderUid": "atlas-storage", + "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, "time": {"from": "now-12h", "to": 
"now"}, @@ -547,6 +983,95 @@ def build_storage_dashboard(): } +def build_network_dashboard(): + panels = [] + panels.append( + stat_panel(1, "Ingress bytes/s", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="bytes/sec") + ) + panels.append( + stat_panel(2, "Egress bytes/s", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="bytes/sec") + ) + panels.append( + stat_panel( + 3, + "Top router req/s", + 'max(topk(1, rate(traefik_router_requests_total[5m])))', + {"h": 4, "w": 8, "x": 16, "y": 0}, + unit="req/s", + ) + ) + panels.append( + timeseries_panel( + 4, + "Per-node throughput", + NET_SERIES_EXPR, + {"h": 8, "w": 24, "x": 0, "y": 4}, + unit="bytes/sec", + legend="{{node}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + table_panel( + 5, + "Top namespaces", + 'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) ' + '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))', + {"h": 9, "w": 12, "x": 0, "y": 12}, + unit="bytes/sec", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + panels.append( + table_panel( + 6, + "Top pods", + 'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) ' + '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))', + {"h": 9, "w": 12, "x": 12, "y": 12}, + unit="bytes/sec", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + panels.append( + timeseries_panel( + 7, + "Traefik routers (req/s)", + 'topk(10, rate(traefik_router_requests_total[5m]))', + {"h": 9, "w": 12, "x": 0, "y": 21}, + unit="req/s", + legend="{{router}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 8, + "Traefik entrypoints (req/s)", + 'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))', + {"h": 9, "w": 12, "x": 12, "y": 21}, + unit="req/s", + legend="{{entrypoint}}", + legend_display="table", + 
legend_placement="right", + ) + ) + return { + "uid": "atlas-network", + "title": "Atlas Network", + "folderUid": PRIVATE_FOLDER, + "editable": True, + "panels": panels, + "time": {"from": "now-12h", "to": "now"}, + "annotations": {"list": []}, + "schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "network"], + } + + DASHBOARDS = { "atlas-overview": { "builder": build_overview, @@ -564,20 +1089,24 @@ DASHBOARDS = { "builder": build_storage_dashboard, "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-storage.yaml", }, + "atlas-network": { + "builder": build_network_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml", + }, } -def write_json(uid: str, data: dict) -> None: +def write_json(uid, data): DASHBOARD_DIR.mkdir(parents=True, exist_ok=True) path = DASHBOARD_DIR / f"{uid}.json" path.write_text(json.dumps(data, indent=2) + "\n") -def render_configmap(uid: str, data: dict) -> None: +def render_configmap(uid, info): json_path = DASHBOARD_DIR / f"{uid}.json" payload = json.dumps(json.loads(json_path.read_text()), indent=2) indented = "\n".join(" " + line for line in payload.splitlines()) - output_path = data["configmap"] + output_path = info["configmap"] content = CONFIG_TEMPLATE.format( relative_path=output_path.relative_to(ROOT), name=output_path.stem, diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json new file mode 100644 index 0000000..3846d2a --- /dev/null +++ b/services/monitoring/dashboards/atlas-network.json @@ -0,0 +1,384 @@ +{ + "uid": "atlas-network", + "title": "Atlas Network", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Ingress bytes/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" 
,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Egress bytes/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 0 + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Top router req/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 0 + }, + "targets": [ + { + "expr": "max(topk(1, rate(traefik_router_requests_total[5m])))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": 
[ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "req/s", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "timeseries", + "title": "Per-node throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 4 + }, + "targets": [ + { + "expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 5, + "type": "table", + "title": "Top namespaces", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 6, + "type": "table", + "title": "Top pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 12 + }, + "targets": [ + { + "expr": "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + 
rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 7, + "type": "timeseries", + "title": "Traefik routers (req/s)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 21 + }, + "targets": [ + { + "expr": "topk(10, rate(traefik_router_requests_total[5m]))", + "refId": "A", + "legendFormat": "{{router}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "req/s" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 8, + "type": "timeseries", + "title": "Traefik entrypoints (req/s)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 21 + }, + "targets": [ + { + "expr": "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))", + "refId": "A", + "legendFormat": "{{entrypoint}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "req/s" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "network" + ] +} diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index d3393a9..e974d8a 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -1,26 +1,26 @@ { "uid": "atlas-nodes", "title": "Atlas Nodes", - "folderUid": "atlas-nodes", + "folderUid": "atlas-internal", 
"editable": true, "panels": [ { "id": 1, "type": "stat", - "title": "Node count", + "title": "Worker nodes ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 6, + "h": 4, + "w": 8, "x": 0, "y": 0 }, "targets": [ { - "expr": "count(kube_node_info)", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -43,7 +43,11 @@ } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": "/18" + } }, "overrides": [] }, @@ -64,20 +68,20 @@ { "id": 2, "type": "stat", - "title": "Ready nodes", + "title": "Control plane ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 6, - "x": 6, + "h": 4, + "w": 8, + "x": 8, "y": 0 }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})", "refId": "A" } ], @@ -100,7 +104,11 @@ } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": "/3" + } }, "overrides": [] }, @@ -121,22 +129,21 @@ { "id": 3, "type": "stat", - "title": "Control plane CPU avg", + "title": "Control plane workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 6, - "x": 12, + "h": 4, + "w": 8, + "x": 16, "y": 0 }, "targets": [ { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", - "refId": "A", - "legendFormat": "{{node}}" + "expr": 
"sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "refId": "A" } ], "fieldConfig": { @@ -158,7 +165,10 @@ } ] }, - "unit": "percent" + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -173,69 +183,11 @@ "fields": "", "values": false }, - "textMode": "value_and_name" + "textMode": "value" } }, { "id": 4, - "type": "stat", - "title": "Control plane RAM avg", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 6, - "x": 18, - "y": 0 - }, - "targets": [ - { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value_and_name" - } - }, - { - "id": 5, "type": "timeseries", "title": "Node CPU", "datasource": { @@ -246,11 +198,51 @@ "h": 9, "w": 24, "x": 0, - "y": 5 + "y": 4 }, "targets": [ { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "avg by (node) (((1 - avg by (instance) 
(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 5, + "type": "timeseries", + "title": "Node RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 13 + }, + "targets": [ + { + "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -277,20 +269,20 @@ { "id": 6, "type": "timeseries", - "title": "Node RAM", + "title": "Control plane (incl. 
titan-db) CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 24, + "w": 12, "x": 0, - "y": 14 + "y": 22 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -304,10 +296,7 @@ "options": { "legend": { "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] + "placement": "right" }, "tooltip": { "mode": "multi" @@ -317,7 +306,44 @@ { "id": 7, "type": "timeseries", - "title": "Root filesystem", + "title": "Control plane (incl. 
titan-db) RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 22 + }, + "targets": [ + { + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 8, + "type": "timeseries", + "title": "Root filesystem usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -326,11 +352,11 @@ "h": 9, "w": 24, "x": 0, - "y": 23 + "y": 31 }, "targets": [ { - "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -350,7 +376,7 @@ "mode": "multi" } }, - "timeFrom": "7d" + "timeFrom": "30d" } ], "time": { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index d7a0d27..3377a13 100644 --- 
a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1,45 +1,11 @@ { "uid": "atlas-overview", "title": "Atlas Overview", - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": false, "folderUid": "atlas-overview", - "graphTooltip": 0, - "links": [ - { - "title": "Pods dashboard", - "type": "dashboard", - "dashboardUid": "atlas-pods", - "keepTime": false - }, - { - "title": "Nodes dashboard", - "type": "dashboard", - "dashboardUid": "atlas-nodes", - "keepTime": false - }, - { - "title": "Storage dashboard", - "type": "dashboard", - "dashboardUid": "atlas-storage", - "keepTime": false - } - ], + "editable": false, + "annotations": { + "list": [] + }, "panels": [ { "id": 1, @@ -80,7 +46,10 @@ } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -114,7 +83,7 @@ }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -128,16 +97,20 @@ "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "red", "value": null }, { "color": "green", - "value": 1 + "value": 18 } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": "/18" + } }, "overrides": [] }, @@ -185,16 +158,20 @@ "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "red", "value": null }, { "color": "green", - "value": 1 + "value": 3 } ] }, - "unit": "none" + "unit": "none", + 
"custom": { + "displayMode": "auto", + "valueSuffix": "/3" + } }, "overrides": [] }, @@ -215,7 +192,7 @@ { "id": 4, "type": "stat", - "title": "Control plane schedulable", + "title": "Control plane workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -228,7 +205,7 @@ }, "targets": [ { - "expr": "sum(kube_node_spec_unschedulable{node=~\"titan-0a|titan-0b|titan-0c\"} == 0)", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", "refId": "A" } ], @@ -242,16 +219,19 @@ "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", + "color": "red", "value": 1 } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -267,7 +247,14 @@ "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] }, { "id": 5, @@ -285,7 +272,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", "refId": "A" } ], @@ -299,16 +286,19 @@ "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", + "color": "red", "value": 1 } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -324,7 +314,14 @@ "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] }, { "id": 6, @@ -342,10 +339,222 @@ }, "targets": [ { - "expr": "sum(((time() - kube_pod_deletion_timestamp) > 600))", + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) 
kube_pod_deletion_timestamp{pod!=\"\"} > 0))", "refId": "A" } ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] + }, + { + "id": 7, + "type": "stat", + "title": "Hottest node: CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 5 + }, + "targets": [ + { + "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] + }, + { + "id": 8, + "type": "stat", + "title": 
"Hottest node: RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 5 + }, + "targets": [ + { + "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] + }, + { + "id": 9, + "type": "stat", + "title": "Hottest node: NET", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 5 + }, + "targets": [ + { + "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], "fieldConfig": { "defaults": { "color": { @@ -365,69 +574,10 @@ } ] }, - "unit": "none" - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" 
- } - }, - { - "id": 7, - "type": "stat", - "title": "Hottest node: CPU", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 4, - "x": 24, - "y": 0 - }, - "targets": [ - { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], - "thresholds": { - "mode": "percentage", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 70 - }, - { - "color": "red", - "value": 85 - } - ] - }, - "unit": "percent" + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -443,25 +593,32 @@ "values": false }, "textMode": "value_and_name" - } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] }, { - "id": 8, + "id": 10, "type": "stat", - "title": "Hottest node: RAM", + "title": "Hottest node: I/O", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, - "w": 4, - "x": 28, - "y": 0 + "w": 6, + "x": 18, + "y": 5 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "refId": "A", "legendFormat": "{{node}}" } @@ -473,23 +630,22 @@ }, "mappings": [], "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", - "value": 70 - }, - { - 
"color": "red", - "value": 85 + "color": "green", + "value": 1 } ] }, - "unit": "percent" + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -505,10 +661,17 @@ "values": false }, "textMode": "value_and_name" - } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] }, { - "id": 9, + "id": 11, "type": "piechart", "title": "Namespace CPU share", "datasource": { @@ -519,18 +682,18 @@ "h": 9, "w": 12, "x": 0, - "y": 5 + "y": 10 }, "targets": [ { - "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))", - "refId": "A", - "legendFormat": "{{namespace}}" + "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "displayName": "{{namespace}}" }, "overrides": [] }, @@ -550,7 +713,7 @@ } }, { - "id": 10, + "id": 12, "type": "piechart", "title": "Namespace RAM share", "datasource": { @@ -561,18 +724,18 @@ "h": 9, "w": 12, "x": 12, - "y": 5 + "y": 10 }, "targets": [ { - "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))", - "refId": "A", - "legendFormat": "{{namespace}}" + "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "displayName": "{{namespace}}" }, "overrides": [] }, @@ -592,7 +755,7 @@ } }, { - "id": 11, + "id": 13, "type": "timeseries", "title": "Cluster node CPU", "datasource": { @@ -603,11 +766,11 @@ "h": 8, "w": 12, "x": 0, - "y": 14 + "y": 19 }, "targets": [ { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) 
group_left(node) kube_node_info))", + "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -629,10 +792,17 @@ "tooltip": { "mode": "multi" } - } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] }, { - "id": 12, + "id": 14, "type": "timeseries", "title": "Cluster node RAM", "datasource": { @@ -643,11 +813,11 @@ "h": 8, "w": 12, "x": 12, - "y": 14 + "y": 19 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -669,92 +839,19 @@ "tooltip": { "mode": "multi" } - } - }, - { - "id": 13, - "type": "table", - "title": "Problem pods (details)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 22 - }, - "targets": [ + "links": [ { - "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "s" - }, - "overrides": [] - }, - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - } - ] - }, - { - "id": 14, - "type": "table", - 
"title": "Terminating >10m", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 22 - }, - "targets": [ - { - "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "s" - }, - "overrides": [] - }, - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "filterByValue", - "options": { - "match": "Value", - "operator": "gt", - "value": 600 - } + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true } ] }, { "id": 15, "type": "timeseries", - "title": "Control plane CPU", + "title": "Control plane CPU (incl. titan-db)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -763,11 +860,11 @@ "h": 7, "w": 12, "x": 0, - "y": 30 + "y": 27 }, "targets": [ { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -781,7 +878,7 @@ "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right" }, "tooltip": { "mode": "multi" @@ -791,7 +888,7 @@ { "id": 16, "type": "timeseries", - "title": "Control plane RAM", + "title": "Control plane RAM (incl. 
titan-db)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -800,11 +897,11 @@ "h": 7, "w": 12, "x": 12, - "y": 30 + "y": 27 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -818,7 +915,7 @@ "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right" }, "tooltip": { "mode": "multi" @@ -828,6 +925,92 @@ { "id": 17, "type": "timeseries", + "title": "Cluster ingress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 34 + }, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 18, + "type": "timeseries", + "title": "Cluster egress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, 
+ "y": 34 + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 19, + "type": "timeseries", "title": "Root filesystem usage", "datasource": { "type": "prometheus", @@ -837,11 +1020,11 @@ "h": 8, "w": 12, "x": 0, - "y": 37 + "y": 41 }, "targets": [ { - "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -864,10 +1047,17 @@ "mode": "multi" } }, - "timeFrom": "7d" + "timeFrom": "30d", + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 18, + "id": 20, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -878,13 +1068,12 @@ "h": 8, "w": 12, "x": 12, - "y": 37 + "y": 41 }, "targets": [ { - "expr": "topk(8, avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, 
\"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info))", - "refId": "A", - "legendFormat": "{{node}}" + "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A" } ], "fieldConfig": { @@ -893,7 +1082,7 @@ "min": 0, "max": 100, "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ { "color": "green", @@ -912,7 +1101,8 @@ "value": 85 } ] - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -926,10 +1116,17 @@ "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 19, + "id": 21, "type": "stat", "title": "Astreae usage", "datasource": { @@ -940,7 +1137,7 @@ "h": 6, "w": 6, "x": 0, - "y": 45 + "y": 49 }, "targets": [ { @@ -971,7 +1168,10 @@ } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -987,10 +1187,17 @@ "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 20, + "id": 22, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1001,7 +1208,7 @@ "h": 6, "w": 6, "x": 6, - "y": 45 + "y": 49 }, "targets": [ { @@ -1032,7 +1239,10 @@ } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -1048,10 +1258,17 @@ "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 21, + "id": 23, "type": "stat", 
"title": "Astreae free", "datasource": { @@ -1062,7 +1279,7 @@ "h": 6, "w": 6, "x": 12, - "y": 45 + "y": 49 }, "targets": [ { @@ -1089,7 +1306,10 @@ } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -1105,10 +1325,17 @@ "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 22, + "id": 24, "type": "stat", "title": "Asteria free", "datasource": { @@ -1119,7 +1346,7 @@ "h": 6, "w": 6, "x": 18, - "y": 45 + "y": 49 }, "targets": [ { @@ -1146,7 +1373,10 @@ } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -1162,77 +1392,12 @@ "values": false }, "textMode": "value" - } - }, - { - "id": 23, - "type": "table", - "title": "Astreae per-node usage", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 51 - }, - "targets": [ + "links": [ { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - } - ] - }, - { - "id": 24, - "type": "table", - "title": "Asteria per-node usage", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 51 - }, - "targets": [ - { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - 
}, - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true } ] }, @@ -1244,16 +1409,15 @@ "h": 5, "w": 24, "x": 0, - "y": 59 + "y": 55 }, "datasource": null, "options": { "mode": "markdown", - "content": "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders" + "content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly." } } ], - "refresh": "30s", "schemaVersion": 39, "style": "dark", "tags": [ @@ -1266,5 +1430,31 @@ "time": { "from": "now-12h", "to": "now" - } + }, + "links": [ + { + "title": "Atlas Pods", + "type": "dashboard", + "dashboardUid": "atlas-pods", + "keepTime": false + }, + { + "title": "Atlas Nodes", + "type": "dashboard", + "dashboardUid": "atlas-nodes", + "keepTime": false + }, + { + "title": "Atlas Storage", + "type": "dashboard", + "dashboardUid": "atlas-storage", + "keepTime": false + }, + { + "title": "Atlas Network", + "type": "dashboard", + "dashboardUid": "atlas-network", + "keepTime": false + } + ] } diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index 91f80eb..3e7dd0e 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -1,11 +1,251 @@ { "uid": "atlas-pods", "title": "Atlas Pods", - "folderUid": "atlas-pods", + "folderUid": "atlas-internal", "editable": true, "panels": [ { "id": 1, + "type": "stat", + "title": "Problem pods", + "datasource": { + 
"type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "CrashLoop / ImagePull", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Stuck terminating (>10m)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (((time() - 
kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Control plane workloads", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, "type": "table", "title": "Pods not running", "datasource": { @@ -16,11 +256,11 @@ "h": 10, "w": 24, "x": 0, - "y": 0 + "y": 4 }, "targets": [ { - "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", 
+ "expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", "refId": "A" } ], @@ -41,7 +281,7 @@ ] }, { - "id": 2, + "id": 6, "type": "table", "title": "CrashLoop / ImagePull", "datasource": { @@ -52,11 +292,11 @@ "h": 10, "w": 24, "x": 0, - "y": 10 + "y": 14 }, "targets": [ { - "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", + "expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by (namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", "refId": "A" } ], @@ -77,9 +317,9 @@ ] }, { - "id": 3, + "id": 7, "type": "table", - "title": "Terminating pods", + "title": "Terminating >10m", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -88,11 +328,11 @@ "h": 10, "w": 24, "x": 0, - "y": 20 + "y": 24 }, "targets": [ { - "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + "expr": "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-storage.json b/services/monitoring/dashboards/atlas-storage.json index aa1948d..bb7d152 100644 --- a/services/monitoring/dashboards/atlas-storage.json +++ b/services/monitoring/dashboards/atlas-storage.json @@ -1,7 +1,7 @@ { "uid": "atlas-storage", "title": "Atlas Storage", - "folderUid": "atlas-storage", + "folderUid": "atlas-internal", "editable": true, "panels": [ { @@ 
-31,19 +31,26 @@ }, "mappings": [], "thresholds": { - "mode": "absolute", + "mode": "percentage", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", - "value": 1 + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -88,19 +95,26 @@ }, "mappings": [], "thresholds": { - "mode": "absolute", + "mode": "percentage", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", - "value": 1 + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -157,7 +171,10 @@ } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -214,7 +231,10 @@ } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -235,20 +255,20 @@ { "id": 5, "type": "timeseries", - "title": "Root filesystem", + "title": "Astreae per-node usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 24, + "w": 12, "x": 0, "y": 5 }, "targets": [ { - "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", 
\"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -272,21 +292,59 @@ }, { "id": 6, - "type": "table", - "title": "Astreae nodes", + "type": "timeseries", + "title": "Asteria per-node usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 10, + "h": 9, + "w": 12, + "x": 12, + "y": 5 + }, + "targets": [ + { + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "30d" + }, + { + "id": 7, + "type": "timeseries", + "title": "Astreae usage history", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, "w": 12, "x": 0, "y": 14 }, "targets": [ { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], @@ -297,32 +355,33 @@ "overrides": [] }, "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" } - ] + }, + "timeFrom": "90d" }, { - "id": 7, - "type": "table", - "title": "Asteria nodes", + "id": 8, + "type": "timeseries", + 
"title": "Asteria usage history", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 10, + "h": 9, "w": 12, "x": 12, "y": 14 }, "targets": [ { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], @@ -333,14 +392,15 @@ "overrides": [] }, "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" } - ] + }, + "timeFrom": "90d" } ], "time": { diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml new file mode 100644 index 0000000..e1ba054 --- /dev/null +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -0,0 +1,393 @@ +# services/monitoring/grafana-dashboard-network.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-network + labels: + grafana_dashboard: "1" +data: + atlas-network.json: | + { + "uid": "atlas-network", + "title": "Atlas Network", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Ingress bytes/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + 
"value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Egress bytes/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 0 + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Top router req/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 0 + }, + "targets": [ + { + "expr": "max(topk(1, rate(traefik_router_requests_total[5m])))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "req/s", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": 
"area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "timeseries", + "title": "Per-node throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 4 + }, + "targets": [ + { + "expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 5, + "type": "table", + "title": "Top namespaces", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]) + rate(container_network_receive_bytes_total{namespace!=\"\"}[5m])) by (namespace))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 6, + "type": "table", + "title": "Top pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 12 + }, + "targets": [ + { + "expr": "topk(10, sum(rate(container_network_transmit_bytes_total{pod!=\"\"}[5m]) + rate(container_network_receive_bytes_total{pod!=\"\"}[5m])) by (namespace,pod))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { 
+ "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 7, + "type": "timeseries", + "title": "Traefik routers (req/s)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 21 + }, + "targets": [ + { + "expr": "topk(10, rate(traefik_router_requests_total[5m]))", + "refId": "A", + "legendFormat": "{{router}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "req/s" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 8, + "type": "timeseries", + "title": "Traefik entrypoints (req/s)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 21 + }, + "targets": [ + { + "expr": "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))", + "refId": "A", + "legendFormat": "{{entrypoint}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "req/s" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "network" + ] + } diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index 516f207..afbeb3c 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -10,26 +10,26 @@ data: { "uid": "atlas-nodes", "title": "Atlas Nodes", - "folderUid": "atlas-nodes", + "folderUid": "atlas-internal", "editable": true, "panels": [ { "id": 1, "type": "stat", - "title": "Node count", + "title": "Worker nodes ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 6, + "h": 4, + "w": 8, "x": 0, "y": 0 }, 
"targets": [ { - "expr": "count(kube_node_info)", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -52,7 +52,11 @@ data: } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": "/18" + } }, "overrides": [] }, @@ -73,20 +77,20 @@ data: { "id": 2, "type": "stat", - "title": "Ready nodes", + "title": "Control plane ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 6, - "x": 6, + "h": 4, + "w": 8, + "x": 8, "y": 0 }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})", "refId": "A" } ], @@ -109,7 +113,11 @@ data: } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": "/3" + } }, "overrides": [] }, @@ -130,22 +138,21 @@ data: { "id": 3, "type": "stat", - "title": "Control plane CPU avg", + "title": "Control plane workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 5, - "w": 6, - "x": 12, + "h": 4, + "w": 8, + "x": 16, "y": 0 }, "targets": [ { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", - "refId": "A", - "legendFormat": "{{node}}" + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "refId": "A" } ], "fieldConfig": { @@ -167,7 +174,10 @@ data: } ] }, - "unit": "percent" + "unit": 
"none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -182,69 +192,11 @@ data: "fields": "", "values": false }, - "textMode": "value_and_name" + "textMode": "value" } }, { "id": 4, - "type": "stat", - "title": "Control plane RAM avg", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 6, - "x": 18, - "y": 0 - }, - "targets": [ - { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value_and_name" - } - }, - { - "id": 5, "type": "timeseries", "title": "Node CPU", "datasource": { @@ -255,11 +207,51 @@ data: "h": 9, "w": 24, "x": 0, - "y": 5 + "y": 4 }, "targets": [ { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": 
{ + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 5, + "type": "timeseries", + "title": "Node RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 13 + }, + "targets": [ + { + "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -286,20 +278,20 @@ data: { "id": 6, "type": "timeseries", - "title": "Node RAM", + "title": "Control plane (incl. titan-db) CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 24, + "w": 12, "x": 0, - "y": 14 + "y": 22 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -313,10 +305,7 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] + "placement": "right" }, "tooltip": { "mode": "multi" @@ -326,7 +315,44 @@ data: { "id": 7, "type": "timeseries", - "title": "Root filesystem", + "title": "Control plane 
(incl. titan-db) RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 22 + }, + "targets": [ + { + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 8, + "type": "timeseries", + "title": "Root filesystem usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -335,11 +361,11 @@ data: "h": 9, "w": 24, "x": 0, - "y": 23 + "y": 31 }, "targets": [ { - "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -359,7 +385,7 @@ data: "mode": "multi" } }, - "timeFrom": "7d" + "timeFrom": "30d" } ], "time": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 
a20e05a..199dfb2 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -10,45 +10,11 @@ data: { "uid": "atlas-overview", "title": "Atlas Overview", - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": false, "folderUid": "atlas-overview", - "graphTooltip": 0, - "links": [ - { - "title": "Pods dashboard", - "type": "dashboard", - "dashboardUid": "atlas-pods", - "keepTime": false - }, - { - "title": "Nodes dashboard", - "type": "dashboard", - "dashboardUid": "atlas-nodes", - "keepTime": false - }, - { - "title": "Storage dashboard", - "type": "dashboard", - "dashboardUid": "atlas-storage", - "keepTime": false - } - ], + "editable": false, + "annotations": { + "list": [] + }, "panels": [ { "id": 1, @@ -89,7 +55,10 @@ data: } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -123,7 +92,7 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], @@ -137,16 +106,20 @@ data: "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "red", "value": null }, { "color": "green", - "value": 1 + "value": 18 } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": "/18" + } }, "overrides": [] }, @@ -194,16 +167,20 @@ data: "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "red", "value": null }, { "color": "green", - "value": 
1 + "value": 3 } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": "/3" + } }, "overrides": [] }, @@ -224,7 +201,7 @@ data: { "id": 4, "type": "stat", - "title": "Control plane schedulable", + "title": "Control plane workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -237,7 +214,7 @@ data: }, "targets": [ { - "expr": "sum(kube_node_spec_unschedulable{node=~\"titan-0a|titan-0b|titan-0c\"} == 0)", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", "refId": "A" } ], @@ -251,16 +228,19 @@ data: "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", + "color": "red", "value": 1 } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -276,7 +256,14 @@ data: "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] }, { "id": 5, @@ -294,7 +281,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", "refId": "A" } ], @@ -308,16 +295,19 @@ data: "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", + "color": "red", "value": 1 } ] }, - "unit": "none" + "unit": "none", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -333,7 +323,14 @@ data: "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] }, { "id": 6, @@ -351,10 +348,222 @@ data: }, "targets": [ { - "expr": "sum(((time() - kube_pod_deletion_timestamp) > 600))", + "expr": "sum(max by (namespace,pod) 
(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))", "refId": "A" } ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] + }, + { + "id": 7, + "type": "stat", + "title": "Hottest node: CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 5 + }, + "targets": [ + { + "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", 
+ "targetBlank": true + } + ] + }, + { + "id": 8, + "type": "stat", + "title": "Hottest node: RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 5 + }, + "targets": [ + { + "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] + }, + { + "id": 9, + "type": "stat", + "title": "Hottest node: NET", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 5 + }, + "targets": [ + { + "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], "fieldConfig": { "defaults": { "color": { @@ -374,69 +583,10 @@ data: } ] }, - "unit": "none" - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": 
[ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 7, - "type": "stat", - "title": "Hottest node: CPU", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 4, - "x": 24, - "y": 0 - }, - "targets": [ - { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], - "thresholds": { - "mode": "percentage", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 70 - }, - { - "color": "red", - "value": 85 - } - ] - }, - "unit": "percent" + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -452,25 +602,32 @@ data: "values": false }, "textMode": "value_and_name" - } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] }, { - "id": 8, + "id": 10, "type": "stat", - "title": "Hottest node: RAM", + "title": "Hottest node: I/O", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 5, - "w": 4, - "x": 28, - "y": 0 + "w": 6, + "x": 18, + "y": 5 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "refId": "A", "legendFormat": "{{node}}" } @@ -482,23 +639,22 @@ data: }, "mappings": [], "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ { - "color": "green", + 
"color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", - "value": 70 - }, - { - "color": "red", - "value": 85 + "color": "green", + "value": 1 } ] }, - "unit": "percent" + "unit": "bytes/sec", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -514,10 +670,17 @@ data: "values": false }, "textMode": "value_and_name" - } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] }, { - "id": 9, + "id": 11, "type": "piechart", "title": "Namespace CPU share", "datasource": { @@ -528,18 +691,18 @@ data: "h": 9, "w": 12, "x": 0, - "y": 5 + "y": 10 }, "targets": [ { - "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))", - "refId": "A", - "legendFormat": "{{namespace}}" + "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "displayName": "{{namespace}}" }, "overrides": [] }, @@ -559,7 +722,7 @@ data: } }, { - "id": 10, + "id": 12, "type": "piechart", "title": "Namespace RAM share", "datasource": { @@ -570,18 +733,18 @@ data: "h": 9, "w": 12, "x": 12, - "y": 5 + "y": 10 }, "targets": [ { - "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))", - "refId": "A", - "legendFormat": "{{namespace}}" + "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "displayName": "{{namespace}}" }, "overrides": [] }, @@ -601,7 +764,7 @@ data: } }, { - "id": 11, + "id": 13, "type": "timeseries", "title": "Cluster node CPU", "datasource": { @@ -612,11 +775,11 @@ data: "h": 8, "w": 12, "x": 0, - "y": 14 + "y": 19 }, "targets": [ { - "expr": "avg by (node) ((((1 - 
label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -638,10 +801,17 @@ data: "tooltip": { "mode": "multi" } - } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] }, { - "id": 12, + "id": 14, "type": "timeseries", "title": "Cluster node RAM", "datasource": { @@ -652,11 +822,11 @@ data: "h": 8, "w": 12, "x": 12, - "y": 14 + "y": 19 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -678,92 +848,19 @@ data: "tooltip": { "mode": "multi" } - } - }, - { - "id": 13, - "type": "table", - "title": "Problem pods (details)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 22 - }, - "targets": [ + "links": [ { - "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "s" - }, - "overrides": [] - }, 
- "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - } - ] - }, - { - "id": 14, - "type": "table", - "title": "Terminating >10m", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 22 - }, - "targets": [ - { - "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "s" - }, - "overrides": [] - }, - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "filterByValue", - "options": { - "match": "Value", - "operator": "gt", - "value": 600 - } + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true } ] }, { "id": 15, "type": "timeseries", - "title": "Control plane CPU", + "title": "Control plane CPU (incl. titan-db)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -772,11 +869,11 @@ data: "h": 7, "w": 12, "x": 0, - "y": 30 + "y": 27 }, "targets": [ { - "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -790,7 +887,7 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right" }, "tooltip": { "mode": "multi" @@ -800,7 
+897,7 @@ data: { "id": 16, "type": "timeseries", - "title": "Control plane RAM", + "title": "Control plane RAM (incl. titan-db)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -809,11 +906,11 @@ data: "h": 7, "w": 12, "x": 12, - "y": 30 + "y": 27 }, "targets": [ { - "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -827,7 +924,7 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right" }, "tooltip": { "mode": "multi" @@ -837,6 +934,92 @@ data: { "id": 17, "type": "timeseries", + "title": "Cluster ingress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 34 + }, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 18, + "type": "timeseries", + "title": 
"Cluster egress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 34 + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes/sec" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 19, + "type": "timeseries", "title": "Root filesystem usage", "datasource": { "type": "prometheus", @@ -846,11 +1029,11 @@ data: "h": 8, "w": 12, "x": 0, - "y": 37 + "y": 41 }, "targets": [ { - "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -873,10 +1056,17 @@ data: "mode": "multi" } }, - "timeFrom": "7d" + "timeFrom": "30d", + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 18, + "id": 20, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -887,13 +1077,12 @@ data: "h": 8, "w": 12, "x": 12, - "y": 37 + "y": 41 }, "targets": [ { - "expr": "topk(8, avg by (node) (((1 - 
(label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info))", - "refId": "A", - "legendFormat": "{{node}}" + "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A" } ], "fieldConfig": { @@ -902,7 +1091,7 @@ data: "min": 0, "max": 100, "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ { "color": "green", @@ -921,7 +1110,8 @@ data: "value": 85 } ] - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -935,10 +1125,17 @@ data: "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 19, + "id": 21, "type": "stat", "title": "Astreae usage", "datasource": { @@ -949,7 +1146,7 @@ data: "h": 6, "w": 6, "x": 0, - "y": 45 + "y": 49 }, "targets": [ { @@ -980,7 +1177,10 @@ data: } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -996,10 +1196,17 @@ data: "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 20, + "id": 22, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1010,7 +1217,7 @@ data: "h": 6, "w": 6, "x": 6, - "y": 45 + "y": 49 }, "targets": [ { @@ -1041,7 +1248,10 @@ data: } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -1057,10 +1267,17 @@ data: 
"values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 21, + "id": 23, "type": "stat", "title": "Astreae free", "datasource": { @@ -1071,7 +1288,7 @@ data: "h": 6, "w": 6, "x": 12, - "y": 45 + "y": 49 }, "targets": [ { @@ -1098,7 +1315,10 @@ data: } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -1114,10 +1334,17 @@ data: "values": false }, "textMode": "value" - } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] }, { - "id": 22, + "id": 24, "type": "stat", "title": "Asteria free", "datasource": { @@ -1128,7 +1355,7 @@ data: "h": 6, "w": 6, "x": 18, - "y": 45 + "y": 49 }, "targets": [ { @@ -1155,7 +1382,10 @@ data: } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -1171,77 +1401,12 @@ data: "values": false }, "textMode": "value" - } - }, - { - "id": 23, - "type": "table", - "title": "Astreae per-node usage", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 51 - }, - "targets": [ + "links": [ { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} - } - ] - }, - { - "id": 24, - "type": "table", - "title": "Asteria per-node usage", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 51 - }, - "targets": [ - { - "expr": "100 - 
(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true } ] }, @@ -1253,16 +1418,15 @@ data: "h": 5, "w": 24, "x": 0, - "y": 59 + "y": 55 }, "datasource": null, "options": { "mode": "markdown", - "content": "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders" + "content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly." 
} } ], - "refresh": "30s", "schemaVersion": 39, "style": "dark", "tags": [ @@ -1275,5 +1439,31 @@ data: "time": { "from": "now-12h", "to": "now" - } + }, + "links": [ + { + "title": "Atlas Pods", + "type": "dashboard", + "dashboardUid": "atlas-pods", + "keepTime": false + }, + { + "title": "Atlas Nodes", + "type": "dashboard", + "dashboardUid": "atlas-nodes", + "keepTime": false + }, + { + "title": "Atlas Storage", + "type": "dashboard", + "dashboardUid": "atlas-storage", + "keepTime": false + }, + { + "title": "Atlas Network", + "type": "dashboard", + "dashboardUid": "atlas-network", + "keepTime": false + } + ] } diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index 3b1f5da..58cae77 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -10,11 +10,251 @@ data: { "uid": "atlas-pods", "title": "Atlas Pods", - "folderUid": "atlas-pods", + "folderUid": "atlas-internal", "editable": true, "panels": [ { "id": 1, + "type": "stat", + "title": "Problem pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (kube_pod_status_phase{phase!~\"Running|Succeeded\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "CrashLoop / ImagePull", + "datasource": { + "type": 
"prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Stuck terminating (>10m)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Control plane workloads", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + 
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, "type": "table", "title": "Pods not running", "datasource": { @@ -25,11 +265,11 @@ data: "h": 10, "w": 24, "x": 0, - "y": 0 + "y": 4 }, "targets": [ { - "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", "refId": "A" } ], @@ -50,7 +290,7 @@ data: ] }, { - "id": 2, + "id": 6, "type": "table", "title": "CrashLoop / ImagePull", "datasource": { @@ -61,11 +301,11 @@ data: "h": 10, "w": 24, "x": 0, - "y": 10 + "y": 14 }, "targets": [ { - "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", + "expr": "(time() - kube_pod_created{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) max by 
(namespace,pod,container,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", "refId": "A" } ], @@ -86,9 +326,9 @@ data: ] }, { - "id": 3, + "id": 7, "type": "table", - "title": "Terminating pods", + "title": "Terminating >10m", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -97,11 +337,11 @@ data: "h": 10, "w": 24, "x": 0, - "y": 20 + "y": 24 }, "targets": [ { - "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + "expr": "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-storage.yaml b/services/monitoring/grafana-dashboard-storage.yaml index 5b22804..99439fb 100644 --- a/services/monitoring/grafana-dashboard-storage.yaml +++ b/services/monitoring/grafana-dashboard-storage.yaml @@ -10,7 +10,7 @@ data: { "uid": "atlas-storage", "title": "Atlas Storage", - "folderUid": "atlas-storage", + "folderUid": "atlas-internal", "editable": true, "panels": [ { @@ -40,19 +40,26 @@ data: }, "mappings": [], "thresholds": { - "mode": "absolute", + "mode": "percentage", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", - "value": 1 + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -97,19 +104,26 @@ data: }, "mappings": [], "thresholds": { - "mode": "absolute", + "mode": "percentage", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", - "value": 1 + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 } ] }, - "unit": "percent" + "unit": "percent", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ 
-166,7 +180,10 @@ data: } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -223,7 +240,10 @@ data: } ] }, - "unit": "bytesSI" + "unit": "bytesSI", + "custom": { + "displayMode": "auto" + } }, "overrides": [] }, @@ -244,20 +264,20 @@ data: { "id": 5, "type": "timeseries", - "title": "Root filesystem", + "title": "Astreae per-node usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 24, + "w": 12, "x": 0, "y": 5 }, "targets": [ { - "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } @@ -281,21 +301,59 @@ data: }, { "id": 6, - "type": "table", - "title": "Astreae nodes", + "type": "timeseries", + "title": "Asteria per-node usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 10, + "h": 9, + "w": 12, + "x": 12, + "y": 5 + }, + "targets": [ + { + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": 
"percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "30d" + }, + { + "id": 7, + "type": "timeseries", + "title": "Astreae usage history", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, "w": 12, "x": 0, "y": 14 }, "targets": [ { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], @@ -306,32 +364,33 @@ data: "overrides": [] }, "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" } - ] + }, + "timeFrom": "90d" }, { - "id": 7, - "type": "table", - "title": "Asteria nodes", + "id": 8, + "type": "timeseries", + "title": "Asteria usage history", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 10, + "h": 9, "w": 12, "x": 12, "y": 14 }, "targets": [ { - "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], @@ -342,14 +401,15 @@ data: "overrides": [] }, "options": { - "showHeader": true - }, - "transformations": [ - { - "id": "labelsToFields", - "options": {} + "legend": { + "displayMode": "table", + "placement": 
"bottom" + }, + "tooltip": { + "mode": "multi" } - ] + }, + "timeFrom": "90d" } ], "time": { diff --git a/services/monitoring/grafana-folders.yaml b/services/monitoring/grafana-folders.yaml index d390679..c52b4e1 100644 --- a/services/monitoring/grafana-folders.yaml +++ b/services/monitoring/grafana-folders.yaml @@ -19,22 +19,8 @@ data: permission: Edit - role: Admin permission: Admin - - uid: atlas-pods - title: Atlas Pods - permissions: - - role: Editor - permission: View - - role: Admin - permission: Admin - - uid: atlas-nodes - title: Atlas Nodes - permissions: - - role: Editor - permission: View - - role: Admin - permission: Admin - - uid: atlas-storage - title: Atlas Storage + - uid: atlas-internal + title: Atlas Internal permissions: - role: Editor permission: View diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index e23f903..58035b6 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -288,7 +288,7 @@ spec: path: /var/lib/grafana/dashboards/overview - name: pods orgId: 1 - folder: Atlas Pods + folder: Atlas Internal type: file disableDeletion: false editable: true @@ -296,7 +296,7 @@ spec: path: /var/lib/grafana/dashboards/pods - name: nodes orgId: 1 - folder: Atlas Nodes + folder: Atlas Internal type: file disableDeletion: false editable: true @@ -304,17 +304,26 @@ spec: path: /var/lib/grafana/dashboards/nodes - name: storage orgId: 1 - folder: Atlas Storage + folder: Atlas Internal type: file disableDeletion: false editable: true options: path: /var/lib/grafana/dashboards/storage + - name: network + orgId: 1 + folder: Atlas Internal + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/network dashboardsConfigMaps: overview: grafana-dashboard-overview pods: grafana-dashboard-pods nodes: grafana-dashboard-nodes storage: grafana-dashboard-storage + network: grafana-dashboard-network extraConfigmapMounts: - name: 
grafana-folders mountPath: /etc/grafana/provisioning/folders diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 282ee4f..76263c1 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -9,5 +9,6 @@ resources: - grafana-dashboard-pods.yaml - grafana-dashboard-nodes.yaml - grafana-dashboard-storage.yaml + - grafana-dashboard-network.yaml - grafana-folders.yaml - helmrelease.yaml From 349d9c56ac4db8c9fa18b0923acc0917411beb4c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 18:55:11 -0300 Subject: [PATCH 15/71] monitoring: polish dashboards --- scripts/render_dashboards.py | 55 +++++++++++-------- .../monitoring/dashboards/atlas-network.json | 21 +++---- .../monitoring/dashboards/atlas-overview.json | 38 ++++++------- .../monitoring/dashboards/atlas-pods.json | 4 +- .../monitoring/dashboards/atlas-storage.json | 8 +-- .../monitoring/grafana-dashboard-network.yaml | 21 +++---- .../grafana-dashboard-overview.yaml | 38 ++++++------- .../monitoring/grafana-dashboard-pods.yaml | 4 +- .../monitoring/grafana-dashboard-storage.yaml | 8 +-- 9 files changed, 104 insertions(+), 93 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 67e486a..083ddfe 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -80,6 +80,7 @@ WORKER_TOTAL = len(WORKER_NODES) CONTROL_SUFFIX = f"/{CONTROL_TOTAL}" WORKER_SUFFIX = f"/{WORKER_TOTAL}" CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring" +LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]" # --------------------------------------------------------------------------- # PromQL helpers @@ -149,9 +150,10 @@ CRASHLOOP_EXPR = ( '{reason=~"CrashLoopBackOff|ImagePullBackOff"}))' ) STUCK_TERMINATING_EXPR = ( - 'sum(max by (namespace,pod) ((' - '(time() - kube_pod_deletion_timestamp{pod!=""}) > 600' - ') and on(namespace,pod) 
kube_pod_deletion_timestamp{pod!=""} > 0))' + 'sum(max by (namespace,pod) (' + '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)' + ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)' + '))' ) PROBLEM_TABLE_EXPR = ( @@ -168,9 +170,11 @@ CRASHLOOP_TABLE_EXPR = ( "(kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" ) STUCK_TABLE_EXPR = ( + "(" "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) " - "* on(namespace,pod) group_left(node) kube_pod_info) " - "and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0" + "and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) " + "* on(namespace,pod) group_left(node) kube_pod_info" + ")" ) NAMESPACE_CPU_EXPR = ( @@ -192,6 +196,7 @@ IO_SERIES_EXPR = ( "+ rate(node_disk_written_bytes_total[5m]))" ) IO_TOP_EXPR = f"topk(1, {IO_SERIES_EXPR})" +TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" NET_INGRESS_EXPR = ( 'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) ' "or on() vector(0)" @@ -216,6 +221,7 @@ def stat_panel( thresholds=None, text_mode="value", legend=None, + display_name=None, value_suffix=None, links=None, ): @@ -236,6 +242,8 @@ def stat_panel( } if value_suffix: defaults["custom"]["valueSuffix"] = value_suffix + if display_name: + defaults["displayName"] = display_name panel = { "id": panel_id, "type": "stat", @@ -449,8 +457,8 @@ def build_overview(): hottest = [ (7, "Hottest node: CPU", f"topk(1, {node_cpu_expr()})", "percent"), (8, "Hottest node: RAM", f"topk(1, {node_mem_expr()})", "percent"), - (9, "Hottest node: NET", NET_TOP_EXPR, "bytes/sec"), - (10, "Hottest node: I/O", IO_TOP_EXPR, "bytes/sec"), + (9, "Hottest node: NET", NET_TOP_EXPR, "Bps"), + (10, "Hottest node: I/O", IO_TOP_EXPR, "Bps"), ] for idx, (panel_id, title, expr, unit) in enumerate(hottest): panels.append( @@ -462,7 +470,7 @@ def build_overview(): unit=unit, thresholds=PERCENT_THRESHOLDS 
if unit == "percent" else None, text_mode="value_and_name", - legend="{{node}}", + display_name="{{node}}", links=link_to("atlas-nodes"), ) ) @@ -544,7 +552,7 @@ def build_overview(): "Cluster ingress throughput", NET_INGRESS_EXPR, {"h": 7, "w": 12, "x": 0, "y": 34}, - unit="bytes/sec", + unit="Bps", legend_display="list", legend_placement="bottom", links=link_to("atlas-network"), @@ -556,7 +564,7 @@ def build_overview(): "Cluster egress throughput", NET_EGRESS_EXPR, {"h": 7, "w": 12, "x": 12, "y": 34}, - unit="bytes/sec", + unit="Bps", legend_display="list", legend_placement="bottom", links=link_to("atlas-network"), @@ -616,8 +624,8 @@ def build_overview(): storage_panels = [ (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), - (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "bytesSI"), - (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "bytesSI"), + (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), + (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), ] for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): panels.append( @@ -911,7 +919,7 @@ def build_storage_dashboard(): "Astreae free", astreae_free_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 12, "y": 0}, - unit="bytesSI", + unit="decbytes", ) ) panels.append( @@ -920,14 +928,14 @@ def build_storage_dashboard(): "Asteria free", astreae_free_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 18, "y": 0}, - unit="bytesSI", + unit="decbytes", ) ) panels.append( timeseries_panel( 5, "Astreae per-node usage", - filesystem_usage_expr("/mnt/astreae"), + filesystem_usage_expr("/mnt/astreae", LONGHORN_NODE_REGEX), {"h": 9, "w": 12, "x": 0, "y": 5}, unit="percent", legend="{{node}}", @@ -940,7 +948,7 @@ def build_storage_dashboard(): timeseries_panel( 6, "Asteria per-node usage", - filesystem_usage_expr("/mnt/asteria"), + filesystem_usage_expr("/mnt/asteria", 
LONGHORN_NODE_REGEX), {"h": 9, "w": 12, "x": 12, "y": 5}, unit="percent", legend="{{node}}", @@ -986,18 +994,19 @@ def build_storage_dashboard(): def build_network_dashboard(): panels = [] panels.append( - stat_panel(1, "Ingress bytes/s", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="bytes/sec") + stat_panel(1, "Ingress traffic", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="Bps") ) panels.append( - stat_panel(2, "Egress bytes/s", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="bytes/sec") + stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps") ) panels.append( stat_panel( 3, "Top router req/s", - 'max(topk(1, rate(traefik_router_requests_total[5m])))', + f"topk(1, {TRAEFIK_ROUTER_EXPR})", {"h": 4, "w": 8, "x": 16, "y": 0}, unit="req/s", + display_name="{{router}}", ) ) panels.append( @@ -1006,7 +1015,7 @@ def build_network_dashboard(): "Per-node throughput", NET_SERIES_EXPR, {"h": 8, "w": 24, "x": 0, "y": 4}, - unit="bytes/sec", + unit="Bps", legend="{{node}}", legend_display="table", legend_placement="right", @@ -1019,7 +1028,7 @@ def build_network_dashboard(): 'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))', {"h": 9, "w": 12, "x": 0, "y": 12}, - unit="bytes/sec", + unit="Bps", transformations=[{"id": "labelsToFields", "options": {}}], ) ) @@ -1030,7 +1039,7 @@ def build_network_dashboard(): 'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))', {"h": 9, "w": 12, "x": 12, "y": 12}, - unit="bytes/sec", + unit="Bps", transformations=[{"id": "labelsToFields", "options": {}}], ) ) @@ -1038,7 +1047,7 @@ def build_network_dashboard(): timeseries_panel( 7, "Traefik routers (req/s)", - 'topk(10, rate(traefik_router_requests_total[5m]))', + f"topk(10, {TRAEFIK_ROUTER_EXPR})", {"h": 9, 
"w": 12, "x": 0, "y": 21}, unit="req/s", legend="{{router}}", diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index 3846d2a..369024f 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -7,7 +7,7 @@ { "id": 1, "type": "stat", - "title": "Ingress bytes/s", + "title": "Ingress traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -43,7 +43,7 @@ } ] }, - "unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" } @@ -67,7 +67,7 @@ { "id": 2, "type": "stat", - "title": "Egress bytes/s", + "title": "Egress traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -103,7 +103,7 @@ } ] }, - "unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" } @@ -140,7 +140,7 @@ }, "targets": [ { - "expr": "max(topk(1, rate(traefik_router_requests_total[5m])))", + "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", "refId": "A" } ], @@ -166,7 +166,8 @@ "unit": "req/s", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{router}}" }, "overrides": [] }, @@ -207,7 +208,7 @@ ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -243,7 +244,7 @@ ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -279,7 +280,7 @@ ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -309,7 +310,7 @@ }, "targets": [ { - "expr": "topk(10, rate(traefik_router_requests_total[5m]))", + "expr": "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))", "refId": "A", "legendFormat": "{{router}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 3377a13..ec7a848 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ 
b/services/monitoring/dashboards/atlas-overview.json @@ -339,7 +339,7 @@ }, "targets": [ { - "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))", + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", "refId": "A" } ], @@ -407,8 +407,7 @@ "targets": [ { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -437,7 +436,8 @@ "unit": "percent", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -479,8 +479,7 @@ "targets": [ { "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -509,7 +508,8 @@ "unit": "percent", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -551,8 +551,7 @@ "targets": [ { "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -574,10 +573,11 @@ } ] }, - "unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -619,8 +619,7 @@ "targets": [ { "expr": "topk(1, avg by 
(node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -642,10 +641,11 @@ } ] }, - "unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -944,7 +944,7 @@ ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -987,7 +987,7 @@ ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -1306,7 +1306,7 @@ } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } @@ -1373,7 +1373,7 @@ } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index 3e7dd0e..8494e89 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -140,7 +140,7 @@ }, "targets": [ { - "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))", + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", "refId": "A" } ], @@ -332,7 +332,7 @@ }, "targets": [ { - "expr": "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0", + "expr": "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-storage.json b/services/monitoring/dashboards/atlas-storage.json index bb7d152..6585794 100644 --- 
a/services/monitoring/dashboards/atlas-storage.json +++ b/services/monitoring/dashboards/atlas-storage.json @@ -171,7 +171,7 @@ } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } @@ -231,7 +231,7 @@ } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } @@ -268,7 +268,7 @@ }, "targets": [ { - "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -306,7 +306,7 @@ }, "targets": [ { - "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * 
on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index e1ba054..07c8b7a 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -16,7 +16,7 @@ data: { "id": 1, "type": "stat", - "title": "Ingress bytes/s", + "title": "Ingress traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -52,7 +52,7 @@ data: } ] }, - "unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" } @@ -76,7 +76,7 @@ data: { "id": 2, "type": "stat", - "title": "Egress bytes/s", + "title": "Egress traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -112,7 +112,7 @@ data: } ] }, - "unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" } @@ -149,7 +149,7 @@ data: }, "targets": [ { - "expr": "max(topk(1, rate(traefik_router_requests_total[5m])))", + "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", "refId": "A" } ], @@ -175,7 +175,8 @@ data: "unit": "req/s", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{router}}" }, "overrides": [] }, @@ -216,7 +217,7 @@ data: ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -252,7 +253,7 @@ data: ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -288,7 +289,7 @@ data: ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -318,7 +319,7 @@ data: }, "targets": [ { - "expr": "topk(10, rate(traefik_router_requests_total[5m]))", + "expr": "topk(10, sum by (router) (rate(traefik_router_requests_total[5m])))", "refId": "A", "legendFormat": "{{router}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml 
b/services/monitoring/grafana-dashboard-overview.yaml index 199dfb2..bb3bb11 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -348,7 +348,7 @@ data: }, "targets": [ { - "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))", + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", "refId": "A" } ], @@ -416,8 +416,7 @@ data: "targets": [ { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -446,7 +445,8 @@ data: "unit": "percent", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -488,8 +488,7 @@ data: "targets": [ { "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -518,7 +517,8 @@ data: "unit": "percent", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -560,8 +560,7 @@ data: "targets": [ { "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -583,10 +582,11 @@ data: } ] }, - 
"unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -628,8 +628,7 @@ data: "targets": [ { "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", - "refId": "A", - "legendFormat": "{{node}}" + "refId": "A" } ], "fieldConfig": { @@ -651,10 +650,11 @@ data: } ] }, - "unit": "bytes/sec", + "unit": "Bps", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{node}}" }, "overrides": [] }, @@ -953,7 +953,7 @@ data: ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -996,7 +996,7 @@ data: ], "fieldConfig": { "defaults": { - "unit": "bytes/sec" + "unit": "Bps" }, "overrides": [] }, @@ -1315,7 +1315,7 @@ data: } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } @@ -1382,7 +1382,7 @@ data: } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index 58cae77..e160eca 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -149,7 +149,7 @@ data: }, "targets": [ { - "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > 600) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0))", + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", "refId": "A" } ], @@ -341,7 +341,7 @@ data: }, "targets": [ { - "expr": "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) * on(namespace,pod) group_left(node) kube_pod_info) and on(namespace,pod) kube_pod_deletion_timestamp{pod!=\"\"} > 0", + "expr": "(((time() - kube_pod_deletion_timestamp{pod!=\"\"}) and on(namespace,pod) 
(kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) * on(namespace,pod) group_left(node) kube_pod_info)", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-storage.yaml b/services/monitoring/grafana-dashboard-storage.yaml index 99439fb..1bbf1ea 100644 --- a/services/monitoring/grafana-dashboard-storage.yaml +++ b/services/monitoring/grafana-dashboard-storage.yaml @@ -180,7 +180,7 @@ data: } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } @@ -240,7 +240,7 @@ data: } ] }, - "unit": "bytesSI", + "unit": "decbytes", "custom": { "displayMode": "auto" } @@ -277,7 +277,7 @@ data: }, "targets": [ { - "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "(avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -315,7 +315,7 @@ data: }, "targets": [ { - "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "(avg by (node) ((avg by (instance) ((1 - 
(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-1[2-9]|titan-2[24]\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } From fe8deea9c728bce682be9ee7954c3ddab6a6dd7e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 19:24:03 -0300 Subject: [PATCH 16/71] monitoring: tighten overview stats --- scripts/render_dashboards.py | 29 ++++++--- .../monitoring/dashboards/atlas-network.json | 6 +- .../monitoring/dashboards/atlas-overview.json | 64 ++++++++++++++----- .../monitoring/grafana-dashboard-network.yaml | 6 +- .../grafana-dashboard-overview.yaml | 64 ++++++++++++++----- 5 files changed, 121 insertions(+), 48 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 083ddfe..b88d5a4 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -221,7 +221,6 @@ def stat_panel( thresholds=None, text_mode="value", legend=None, - display_name=None, value_suffix=None, links=None, ): @@ -242,8 +241,6 @@ def stat_panel( } if value_suffix: defaults["custom"]["valueSuffix"] = value_suffix - if display_name: - defaults["displayName"] = display_name panel = { "id": panel_id, "type": "stat", @@ -385,7 +382,7 @@ def build_overview(): (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None), ( 2, - "Ready nodes", + "Ready workers", f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', WORKER_SUFFIX, WORKER_TOTAL, @@ -426,20 +423,32 @@ def build_overview(): ] for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats): thresholds = None - if panel_id in (2, 3): + if 
panel_id == 2: thresholds = { "mode": "absolute", "steps": [ {"color": "red", "value": None}, - {"color": "green", "value": ok_value}, + {"color": "orange", "value": WORKER_TOTAL - 2}, + {"color": "yellow", "value": WORKER_TOTAL - 1}, + {"color": "green", "value": WORKER_TOTAL}, ], } - elif panel_id >= 4: + elif panel_id == 3: + thresholds = { + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "green", "value": CONTROL_TOTAL}, + ], + } + elif panel_id in (4, 5, 6): thresholds = { "mode": "absolute", "steps": [ {"color": "green", "value": None}, - {"color": "red", "value": 1}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 2}, + {"color": "red", "value": 3}, ], } panels.append( @@ -470,7 +479,7 @@ def build_overview(): unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, text_mode="value_and_name", - display_name="{{node}}", + legend="{{node}}", links=link_to("atlas-nodes"), ) ) @@ -1006,7 +1015,7 @@ def build_network_dashboard(): f"topk(1, {TRAEFIK_ROUTER_EXPR})", {"h": 4, "w": 8, "x": 16, "y": 0}, unit="req/s", - display_name="{{router}}", + legend="{{router}}", ) ) panels.append( diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index 369024f..e412045 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -141,7 +141,8 @@ "targets": [ { "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", - "refId": "A" + "refId": "A", + "legendFormat": "{{router}}" } ], "fieldConfig": { @@ -166,8 +167,7 @@ "unit": "req/s", "custom": { "displayMode": "auto" - }, - "displayName": "{{router}}" + } }, "overrides": [] }, diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index ec7a848..ec137f1 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ 
b/services/monitoring/dashboards/atlas-overview.json @@ -70,7 +70,7 @@ { "id": 2, "type": "stat", - "title": "Ready nodes", + "title": "Ready workers", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -100,6 +100,14 @@ "color": "red", "value": null }, + { + "color": "orange", + "value": 16 + }, + { + "color": "yellow", + "value": 17 + }, { "color": "green", "value": 18 @@ -223,8 +231,16 @@ "value": null }, { - "color": "red", + "color": "yellow", "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 } ] }, @@ -290,8 +306,16 @@ "value": null }, { - "color": "red", + "color": "yellow", "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 } ] }, @@ -357,8 +381,16 @@ "value": null }, { - "color": "red", + "color": "yellow", "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 } ] }, @@ -407,7 +439,8 @@ "targets": [ { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A" + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -436,8 +469,7 @@ "unit": "percent", "custom": { "displayMode": "auto" - }, - "displayName": "{{node}}" + } }, "overrides": [] }, @@ -479,7 +511,8 @@ "targets": [ { "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A" + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -508,8 +541,7 @@ "unit": "percent", "custom": { "displayMode": "auto" - }, - "displayName": "{{node}}" + } }, "overrides": [] }, @@ -551,7 +583,8 @@ "targets": [ { "expr": "topk(1, avg by 
(node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", - "refId": "A" + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -576,8 +609,7 @@ "unit": "Bps", "custom": { "displayMode": "auto" - }, - "displayName": "{{node}}" + } }, "overrides": [] }, @@ -619,7 +651,8 @@ "targets": [ { "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", - "refId": "A" + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -644,8 +677,7 @@ "unit": "Bps", "custom": { "displayMode": "auto" - }, - "displayName": "{{node}}" + } }, "overrides": [] }, diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index 07c8b7a..6963e89 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -150,7 +150,8 @@ data: "targets": [ { "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", - "refId": "A" + "refId": "A", + "legendFormat": "{{router}}" } ], "fieldConfig": { @@ -175,8 +176,7 @@ data: "unit": "req/s", "custom": { "displayMode": "auto" - }, - "displayName": "{{router}}" + } }, "overrides": [] }, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index bb3bb11..12555ee 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -79,7 +79,7 @@ data: { "id": 2, "type": "stat", - "title": "Ready nodes", + "title": "Ready workers", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -109,6 +109,14 @@ data: "color": "red", "value": null }, + { + "color": "orange", + "value": 16 + }, + { + "color": "yellow", + "value": 17 + }, { "color": "green", "value": 18 @@ -232,8 +240,16 @@ data: "value": null }, { - 
"color": "red", + "color": "yellow", "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 } ] }, @@ -299,8 +315,16 @@ data: "value": null }, { - "color": "red", + "color": "yellow", "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 } ] }, @@ -366,8 +390,16 @@ data: "value": null }, { - "color": "red", + "color": "yellow", "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 } ] }, @@ -416,7 +448,8 @@ data: "targets": [ { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A" + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -445,8 +478,7 @@ data: "unit": "percent", "custom": { "displayMode": "auto" - }, - "displayName": "{{node}}" + } }, "overrides": [] }, @@ -488,7 +520,8 @@ data: "targets": [ { "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A" + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -517,8 +550,7 @@ data: "unit": "percent", "custom": { "displayMode": "auto" - }, - "displayName": "{{node}}" + } }, "overrides": [] }, @@ -560,7 +592,8 @@ data: "targets": [ { "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", - "refId": "A" + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -585,8 +618,7 @@ data: "unit": "Bps", "custom": { "displayMode": "auto" - }, - "displayName": "{{node}}" + } }, "overrides": [] }, @@ 
-628,7 +660,8 @@ data: "targets": [ { "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", - "refId": "A" + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -653,8 +686,7 @@ data: "unit": "Bps", "custom": { "displayMode": "auto" - }, - "displayName": "{{node}}" + } }, "overrides": [] }, From a1e731e9299a5ca1908a4953d5d1729e9e6d74e9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 19:38:40 -0300 Subject: [PATCH 17/71] monitoring: fix hottest stats and titan-db scrape --- scripts/render_dashboards.py | 17 ++++++------ .../monitoring/dashboards/atlas-network.json | 3 ++- .../monitoring/dashboards/atlas-overview.json | 26 +++++++++++-------- .../monitoring/grafana-dashboard-network.yaml | 3 ++- .../grafana-dashboard-overview.yaml | 26 +++++++++++-------- services/monitoring/helmrelease.yaml | 10 +++++++ 6 files changed, 52 insertions(+), 33 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index b88d5a4..d726015 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -221,6 +221,7 @@ def stat_panel( thresholds=None, text_mode="value", legend=None, + instant=False, value_suffix=None, links=None, ): @@ -259,6 +260,8 @@ def stat_panel( } if legend: panel["targets"][0]["legendFormat"] = legend + if instant: + panel["targets"][0]["instant"] = True if links: panel["links"] = links return panel @@ -339,14 +342,8 @@ def pie_panel(panel_id, title, expr, grid): "title": title, "datasource": PROM_DS, "gridPos": grid, - "targets": [{"expr": expr, "refId": "A"}], - "fieldConfig": { - "defaults": { - "unit": "percent", - "displayName": "{{namespace}}", - }, - "overrides": [], - }, + "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}], + "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []}, "options": { "legend": {"displayMode": "list", "placement": "right"}, "pieType": "pie", @@ -382,7 +379,7 
@@ def build_overview(): (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None), ( 2, - "Ready workers", + "Workers ready", f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', WORKER_SUFFIX, WORKER_TOTAL, @@ -480,6 +477,7 @@ def build_overview(): thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, text_mode="value_and_name", legend="{{node}}", + instant=True, links=link_to("atlas-nodes"), ) ) @@ -1016,6 +1014,7 @@ def build_network_dashboard(): {"h": 4, "w": 8, "x": 16, "y": 0}, unit="req/s", legend="{{router}}", + instant=True, ) ) panels.append( diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index e412045..abd9da7 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -142,7 +142,8 @@ { "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", "refId": "A", - "legendFormat": "{{router}}" + "legendFormat": "{{router}}", + "instant": true } ], "fieldConfig": { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index ec137f1..1442cf5 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -70,7 +70,7 @@ { "id": 2, "type": "stat", - "title": "Ready workers", + "title": "Workers ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -440,7 +440,8 @@ { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -512,7 +513,8 @@ { "expr": "topk(1, avg by (node) ((avg by (instance) 
((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -584,7 +586,8 @@ { "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -652,7 +655,8 @@ { "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -719,13 +723,13 @@ "targets": [ { "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))", - "refId": "A" + "refId": "A", + "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { - "unit": "percent", - "displayName": "{{namespace}}" + "unit": "percent" }, "overrides": [] }, @@ -761,13 +765,13 @@ "targets": [ { "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))", - "refId": "A" + "refId": "A", + "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { - "unit": "percent", - "displayName": "{{namespace}}" + "unit": "percent" }, "overrides": [] }, diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index 6963e89..8f614ae 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -151,7 +151,8 @@ data: { "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", "refId": "A", - "legendFormat": 
"{{router}}" + "legendFormat": "{{router}}", + "instant": true } ], "fieldConfig": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 12555ee..ac95eae 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -79,7 +79,7 @@ data: { "id": 2, "type": "stat", - "title": "Ready workers", + "title": "Workers ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -449,7 +449,8 @@ data: { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -521,7 +522,8 @@ data: { "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -593,7 +595,8 @@ data: { "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -661,7 +664,8 @@ data: { "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -728,13 +732,13 @@ data: "targets": [ { "expr": "topk(10, 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))", - "refId": "A" + "refId": "A", + "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { - "unit": "percent", - "displayName": "{{namespace}}" + "unit": "percent" }, "overrides": [] }, @@ -770,13 +774,13 @@ data: "targets": [ { "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))", - "refId": "A" + "refId": "A", + "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { - "unit": "percent", - "displayName": "{{namespace}}" + "unit": "percent" }, "overrides": [] }, diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 58035b6..5a8f1ba 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -209,6 +209,16 @@ spec: - action: keep source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of] regex: flux-system;flux + - job_name: "titan-db" + static_configs: + - targets: ["titan-db:9100"] + relabel_configs: + - source_labels: [__address__] + target_label: instance + metric_relabel_configs: + - source_labels: [instance] + target_label: node + replacement: titan-db --- From 41e8a6a5829fa54dd54f4f7e0b36020f0a8cc371 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 19:49:50 -0300 Subject: [PATCH 18/71] monitoring: reorder overview stats --- scripts/render_dashboards.py | 20 +-- .../monitoring/dashboards/atlas-overview.json | 146 +++++++++--------- .../grafana-dashboard-overview.yaml | 146 +++++++++--------- 3 files changed, 156 insertions(+), 156 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index d726015..97d64cd 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -376,9 +376,8 @@ def build_overview(): panels = [] row1_stats = [ - (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, 
None), ( - 2, + 1, "Workers ready", f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', WORKER_SUFFIX, @@ -386,7 +385,7 @@ def build_overview(): None, ), ( - 3, + 2, "Control plane ready", f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})', CONTROL_SUFFIX, @@ -394,7 +393,7 @@ def build_overview(): None, ), ( - 4, + 3, "Control plane workloads", f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})', None, @@ -402,7 +401,7 @@ def build_overview(): link_to("atlas-pods"), ), ( - 5, + 4, "Problem pods", PROBLEM_PODS_EXPR, None, @@ -410,17 +409,18 @@ def build_overview(): link_to("atlas-pods"), ), ( - 6, + 5, "Stuck terminating", STUCK_TERMINATING_EXPR, None, 1, link_to("atlas-pods"), ), + (6, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None), ] for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats): thresholds = None - if panel_id == 2: + if panel_id == 1: thresholds = { "mode": "absolute", "steps": [ @@ -430,7 +430,7 @@ def build_overview(): {"color": "green", "value": WORKER_TOTAL}, ], } - elif panel_id == 3: + elif panel_id == 2: thresholds = { "mode": "absolute", "steps": [ @@ -438,7 +438,7 @@ def build_overview(): {"color": "green", "value": CONTROL_TOTAL}, ], } - elif panel_id in (4, 5, 6): + elif panel_id in (3, 4, 5): thresholds = { "mode": "absolute", "steps": [ @@ -475,7 +475,7 @@ def build_overview(): {"h": 5, "w": 6, "x": 6 * idx, "y": 5}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, - text_mode="value_and_name", + text_mode="name_and_value", legend="{{node}}", instant=True, links=link_to("atlas-nodes"), diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 1442cf5..d51d203 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ 
-10,66 +10,6 @@ { "id": 1, "type": "stat", - "title": "Running pods", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 4, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "sum(kube_pod_status_phase{phase=\"Running\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 2, - "type": "stat", "title": "Workers ready", "datasource": { "type": "prometheus", @@ -78,7 +18,7 @@ "gridPos": { "h": 5, "w": 4, - "x": 4, + "x": 0, "y": 0 }, "targets": [ @@ -137,7 +77,7 @@ } }, { - "id": 3, + "id": 2, "type": "stat", "title": "Control plane ready", "datasource": { @@ -147,7 +87,7 @@ "gridPos": { "h": 5, "w": 4, - "x": 8, + "x": 4, "y": 0 }, "targets": [ @@ -198,7 +138,7 @@ } }, { - "id": 4, + "id": 3, "type": "stat", "title": "Control plane workloads", "datasource": { @@ -208,7 +148,7 @@ "gridPos": { "h": 5, "w": 4, - "x": 12, + "x": 8, "y": 0 }, "targets": [ @@ -273,7 +213,7 @@ ] }, { - "id": 5, + "id": 4, "type": "stat", "title": "Problem pods", "datasource": { @@ -283,7 +223,7 @@ "gridPos": { "h": 5, "w": 4, - "x": 16, + "x": 12, "y": 0 }, "targets": [ @@ -348,7 +288,7 @@ ] }, { - "id": 6, + "id": 5, "type": "stat", "title": "Stuck terminating", "datasource": { @@ -358,7 +298,7 @@ "gridPos": { "h": 5, "w": 4, - "x": 20, + "x": 16, "y": 0 }, "targets": [ @@ -422,6 +362,66 @@ } ] }, + { + "id": 6, + "type": "stat", + "title": "Running pods", + "datasource": { + "type": "prometheus", 
+ "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_pod_status_phase{phase=\"Running\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, { "id": 7, "type": "stat", @@ -485,7 +485,7 @@ "fields": "", "values": false }, - "textMode": "value_and_name" + "textMode": "name_and_value" }, "links": [ { @@ -558,7 +558,7 @@ "fields": "", "values": false }, - "textMode": "value_and_name" + "textMode": "name_and_value" }, "links": [ { @@ -627,7 +627,7 @@ "fields": "", "values": false }, - "textMode": "value_and_name" + "textMode": "name_and_value" }, "links": [ { @@ -696,7 +696,7 @@ "fields": "", "values": false }, - "textMode": "value_and_name" + "textMode": "name_and_value" }, "links": [ { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index ac95eae..8d03cf6 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -19,66 +19,6 @@ data: { "id": 1, "type": "stat", - "title": "Running pods", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 4, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "sum(kube_pod_status_phase{phase=\"Running\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], - "thresholds": { - 
"mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 2, - "type": "stat", "title": "Workers ready", "datasource": { "type": "prometheus", @@ -87,7 +27,7 @@ data: "gridPos": { "h": 5, "w": 4, - "x": 4, + "x": 0, "y": 0 }, "targets": [ @@ -146,7 +86,7 @@ data: } }, { - "id": 3, + "id": 2, "type": "stat", "title": "Control plane ready", "datasource": { @@ -156,7 +96,7 @@ data: "gridPos": { "h": 5, "w": 4, - "x": 8, + "x": 4, "y": 0 }, "targets": [ @@ -207,7 +147,7 @@ data: } }, { - "id": 4, + "id": 3, "type": "stat", "title": "Control plane workloads", "datasource": { @@ -217,7 +157,7 @@ data: "gridPos": { "h": 5, "w": 4, - "x": 12, + "x": 8, "y": 0 }, "targets": [ @@ -282,7 +222,7 @@ data: ] }, { - "id": 5, + "id": 4, "type": "stat", "title": "Problem pods", "datasource": { @@ -292,7 +232,7 @@ data: "gridPos": { "h": 5, "w": 4, - "x": 16, + "x": 12, "y": 0 }, "targets": [ @@ -357,7 +297,7 @@ data: ] }, { - "id": 6, + "id": 5, "type": "stat", "title": "Stuck terminating", "datasource": { @@ -367,7 +307,7 @@ data: "gridPos": { "h": 5, "w": 4, - "x": 20, + "x": 16, "y": 0 }, "targets": [ @@ -431,6 +371,66 @@ data: } ] }, + { + "id": 6, + "type": "stat", + "title": "Running pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_pod_status_phase{phase=\"Running\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": 
"rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, { "id": 7, "type": "stat", @@ -494,7 +494,7 @@ data: "fields": "", "values": false }, - "textMode": "value_and_name" + "textMode": "name_and_value" }, "links": [ { @@ -567,7 +567,7 @@ data: "fields": "", "values": false }, - "textMode": "value_and_name" + "textMode": "name_and_value" }, "links": [ { @@ -636,7 +636,7 @@ data: "fields": "", "values": false }, - "textMode": "value_and_name" + "textMode": "name_and_value" }, "links": [ { @@ -705,7 +705,7 @@ data: "fields": "", "values": false }, - "textMode": "value_and_name" + "textMode": "name_and_value" }, "links": [ { From bcaa0a33279e4b1223eac2e560eb925cdfdb8197 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 19:53:39 -0300 Subject: [PATCH 19/71] monitoring: show hottest node names --- scripts/render_dashboards.py | 4 ++++ services/monitoring/dashboards/atlas-overview.json | 12 ++++++++---- services/monitoring/grafana-dashboard-overview.yaml | 12 ++++++++---- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 97d64cd..a9c319a 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -224,6 +224,7 @@ def stat_panel( instant=False, value_suffix=None, links=None, + display_name=None, ): """Return a Grafana stat panel definition.""" defaults = { @@ -242,6 +243,8 @@ def stat_panel( } if value_suffix: defaults["custom"]["valueSuffix"] = value_suffix + if display_name: + defaults["displayName"] = display_name panel = { "id": panel_id, "type": "stat", @@ -478,6 +481,7 @@ def build_overview(): 
text_mode="name_and_value", legend="{{node}}", instant=True, + display_name="{{__field.labels.node}}\\n", links=link_to("atlas-nodes"), ) ) diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index d51d203..f0cceaf 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -470,7 +470,8 @@ "unit": "percent", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{__field.labels.node}}\\n" }, "overrides": [] }, @@ -543,7 +544,8 @@ "unit": "percent", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{__field.labels.node}}\\n" }, "overrides": [] }, @@ -612,7 +614,8 @@ "unit": "Bps", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{__field.labels.node}}\\n" }, "overrides": [] }, @@ -681,7 +684,8 @@ "unit": "Bps", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{__field.labels.node}}\\n" }, "overrides": [] }, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 8d03cf6..1839d8f 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -479,7 +479,8 @@ data: "unit": "percent", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{__field.labels.node}}\\n" }, "overrides": [] }, @@ -552,7 +553,8 @@ data: "unit": "percent", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{__field.labels.node}}\\n" }, "overrides": [] }, @@ -621,7 +623,8 @@ data: "unit": "Bps", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{__field.labels.node}}\\n" }, "overrides": [] }, @@ -690,7 +693,8 @@ data: "unit": "Bps", "custom": { "displayMode": "auto" - } + }, + "displayName": "{{__field.labels.node}}\\n" }, "overrides": [] }, From 4aece7e5cb5e0972fb7419eaaae6ee02ef64909e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 19:56:57 -0300 
Subject: [PATCH 20/71] monitoring: fix hottest node labels --- scripts/render_dashboards.py | 20 +++++++++------- .../monitoring/dashboards/atlas-overview.json | 24 +++++++------------ .../grafana-dashboard-overview.yaml | 24 +++++++------------ 3 files changed, 27 insertions(+), 41 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index a9c319a..acc1c38 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -144,6 +144,12 @@ def astreae_free_expr(mount): return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})" +def hottest_stat_expr(inner_expr): + return ( + f'label_replace(topk(1, {inner_expr}), "__name__", "$1", "node", "(.*)")' + ) + + PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' CRASHLOOP_EXPR = ( 'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason' @@ -224,7 +230,6 @@ def stat_panel( instant=False, value_suffix=None, links=None, - display_name=None, ): """Return a Grafana stat panel definition.""" defaults = { @@ -243,8 +248,6 @@ def stat_panel( } if value_suffix: defaults["custom"]["valueSuffix"] = value_suffix - if display_name: - defaults["displayName"] = display_name panel = { "id": panel_id, "type": "stat", @@ -464,10 +467,10 @@ def build_overview(): ) hottest = [ - (7, "Hottest node: CPU", f"topk(1, {node_cpu_expr()})", "percent"), - (8, "Hottest node: RAM", f"topk(1, {node_mem_expr()})", "percent"), - (9, "Hottest node: NET", NET_TOP_EXPR, "Bps"), - (10, "Hottest node: I/O", IO_TOP_EXPR, "Bps"), + (7, "Hottest node: CPU", hottest_stat_expr(node_cpu_expr()), "percent"), + (8, "Hottest node: RAM", hottest_stat_expr(node_mem_expr()), "percent"), + (9, "Hottest node: NET", hottest_stat_expr(NET_SERIES_EXPR), "Bps"), + (10, "Hottest node: I/O", hottest_stat_expr(IO_SERIES_EXPR), "Bps"), ] for idx, (panel_id, title, expr, unit) in enumerate(hottest): panels.append( @@ -479,9 +482,8 @@ 
def build_overview(): unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, text_mode="name_and_value", - legend="{{node}}", + legend=None, instant=True, - display_name="{{__field.labels.node}}\\n", links=link_to("atlas-nodes"), ) ) diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index f0cceaf..ea4e40e 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -438,9 +438,8 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", "instant": true } ], @@ -470,8 +469,7 @@ "unit": "percent", "custom": { "displayMode": "auto" - }, - "displayName": "{{__field.labels.node}}\\n" + } }, "overrides": [] }, @@ -512,9 +510,8 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", 
"instant": true } ], @@ -544,8 +541,7 @@ "unit": "percent", "custom": { "displayMode": "auto" - }, - "displayName": "{{__field.labels.node}}\\n" + } }, "overrides": [] }, @@ -586,9 +582,8 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", + "expr": "label_replace(topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", "instant": true } ], @@ -614,8 +609,7 @@ "unit": "Bps", "custom": { "displayMode": "auto" - }, - "displayName": "{{__field.labels.node}}\\n" + } }, "overrides": [] }, @@ -656,9 +650,8 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", + "expr": "label_replace(topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", "instant": true } ], @@ -684,8 +677,7 @@ "unit": "Bps", "custom": { "displayMode": "auto" - }, - "displayName": "{{__field.labels.node}}\\n" + } }, "overrides": [] }, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 1839d8f..1df2956 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -447,9 +447,8 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) 
(((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", "instant": true } ], @@ -479,8 +478,7 @@ data: "unit": "percent", "custom": { "displayMode": "auto" - }, - "displayName": "{{__field.labels.node}}\\n" + } }, "overrides": [] }, @@ -521,9 +519,8 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", "instant": true } ], @@ -553,8 +550,7 @@ data: "unit": "percent", "custom": { "displayMode": "auto" - }, - "displayName": "{{__field.labels.node}}\\n" + } }, "overrides": [] }, @@ -595,9 +591,8 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", + "expr": "label_replace(topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", "instant": true } ], @@ -623,8 +618,7 @@ data: "unit": "Bps", "custom": { "displayMode": "auto" - }, - "displayName": 
"{{__field.labels.node}}\\n" + } }, "overrides": [] }, @@ -665,9 +659,8 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", + "expr": "label_replace(topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", - "legendFormat": "{{node}}", "instant": true } ], @@ -693,8 +686,7 @@ data: "unit": "Bps", "custom": { "displayMode": "auto" - }, - "displayName": "{{__field.labels.node}}\\n" + } }, "overrides": [] }, From b28e7501b72d52ea5101f11c80e29fcc6946be14 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 20:00:40 -0300 Subject: [PATCH 21/71] monitoring: show hottest node labels --- scripts/render_dashboards.py | 18 ++++++------------ .../monitoring/dashboards/atlas-overview.json | 16 ++++++++++------ .../monitoring/grafana-dashboard-overview.yaml | 16 ++++++++++------ 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index acc1c38..e215ca8 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -144,12 +144,6 @@ def astreae_free_expr(mount): return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})" -def hottest_stat_expr(inner_expr): - return ( - f'label_replace(topk(1, {inner_expr}), "__name__", "$1", "node", "(.*)")' - ) - - PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' CRASHLOOP_EXPR = ( 'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason' @@ -467,22 +461,22 @@ def build_overview(): ) hottest = [ - (7, "Hottest node: CPU", hottest_stat_expr(node_cpu_expr()), "percent"), - (8, "Hottest node: RAM", hottest_stat_expr(node_mem_expr()), "percent"), - (9, "Hottest node: NET", hottest_stat_expr(NET_SERIES_EXPR), "Bps"), - (10, "Hottest node: I/O", 
hottest_stat_expr(IO_SERIES_EXPR), "Bps"), + (7, "Hottest node: CPU", node_cpu_expr(), "percent"), + (8, "Hottest node: RAM", node_mem_expr(), "percent"), + (9, "Hottest node: NET (rx+tx)", NET_SERIES_EXPR, "Bps"), + (10, "Hottest node: I/O (r+w)", IO_SERIES_EXPR, "Bps"), ] for idx, (panel_id, title, expr, unit) in enumerate(hottest): panels.append( stat_panel( panel_id, title, - expr, + f"topk(1, {expr})", {"h": 5, "w": 6, "x": 6 * idx, "y": 5}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, text_mode="name_and_value", - legend=None, + legend="{{node}}", instant=True, links=link_to("atlas-nodes"), ) diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index ea4e40e..468ca8a 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -438,8 +438,9 @@ }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ], @@ -510,8 +511,9 @@ }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - 
node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ], @@ -569,7 +571,7 @@ { "id": 9, "type": "stat", - "title": "Hottest node: NET", + "title": "Hottest node: NET (rx+tx)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -582,8 +584,9 @@ }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ], @@ -637,7 +640,7 @@ { "id": 10, "type": "stat", - "title": "Hottest node: I/O", + "title": "Hottest node: I/O (r+w)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -650,8 +653,9 @@ }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ], diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 1df2956..dbcc916 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -447,8 +447,9 @@ data: }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * 
on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ], @@ -519,8 +520,9 @@ data: }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ], @@ -578,7 +580,7 @@ data: { "id": 9, "type": "stat", - "title": "Hottest node: NET", + "title": "Hottest node: NET (rx+tx)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -591,8 +593,9 @@ data: }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ], @@ -646,7 +649,7 @@ data: { "id": 10, "type": "stat", - "title": 
"Hottest node: I/O", + "title": "Hottest node: I/O (r+w)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -659,8 +662,9 @@ data: }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "refId": "A", + "legendFormat": "{{node}}", "instant": true } ], From a67a6a1f3a3e903dd86cff14819ad63b71ee5401 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 20:04:50 -0300 Subject: [PATCH 22/71] monitoring: tidy hottest node labels --- scripts/render_dashboards.py | 2 +- services/monitoring/dashboards/atlas-overview.json | 8 ++++---- services/monitoring/grafana-dashboard-overview.yaml | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index e215ca8..4f25ab5 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -476,7 +476,7 @@ def build_overview(): unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, text_mode="name_and_value", - legend="{{node}}", + legend="{{node}}\\n", instant=True, links=link_to("atlas-nodes"), ) diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 468ca8a..ad3a947 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -440,7 +440,7 @@ { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}", + "legendFormat": "{{node}}\\n", "instant": true } ], @@ -513,7 +513,7 @@ { "expr": "topk(1, avg by (node) 
((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}", + "legendFormat": "{{node}}\\n", "instant": true } ], @@ -586,7 +586,7 @@ { "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", "refId": "A", - "legendFormat": "{{node}}", + "legendFormat": "{{node}}\\n", "instant": true } ], @@ -655,7 +655,7 @@ { "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "refId": "A", - "legendFormat": "{{node}}", + "legendFormat": "{{node}}\\n", "instant": true } ], diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index dbcc916..5f3062a 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -449,7 +449,7 @@ data: { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}", + "legendFormat": "{{node}}\\n", "instant": true } ], @@ -522,7 +522,7 @@ data: { "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}", + "legendFormat": "{{node}}\\n", "instant": true } ], @@ -595,7 +595,7 @@ data: { "expr": "topk(1, avg by (node) 
(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", "refId": "A", - "legendFormat": "{{node}}", + "legendFormat": "{{node}}\\n", "instant": true } ], @@ -664,7 +664,7 @@ data: { "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "refId": "A", - "legendFormat": "{{node}}", + "legendFormat": "{{node}}\\n", "instant": true } ], From b8998a3c6ab81493b52ebf18abc21a78ad6c01e9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 20:14:11 -0300 Subject: [PATCH 23/71] monitoring: attach nodes to net/io stats --- scripts/render_dashboards.py | 36 +++++++++++-------- .../monitoring/dashboards/atlas-network.json | 2 +- .../monitoring/dashboards/atlas-overview.json | 12 +++---- .../monitoring/grafana-dashboard-network.yaml | 2 +- .../grafana-dashboard-overview.yaml | 12 +++---- 5 files changed, 35 insertions(+), 29 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 4f25ab5..37f2607 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -144,6 +144,23 @@ def astreae_free_expr(mount): return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})" +def node_net_expr(scope=""): + base = ( + 'sum by (instance) (' + 'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]) ' + '+ rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))' + ) + return scoped_node_expr(base, scope) + + +def node_io_expr(scope=""): + base = ( + "sum by (instance) (rate(node_disk_read_bytes_total[5m]) " + "+ rate(node_disk_written_bytes_total[5m]))" + ) + return scoped_node_expr(base, scope) + + PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' CRASHLOOP_EXPR = ( 'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason' @@ -185,17 
+202,6 @@ NAMESPACE_RAM_EXPR = ( 'topk(10, sum(container_memory_working_set_bytes{namespace!=""' ',pod!=""}) by (namespace))' ) -NET_SERIES_EXPR = ( - 'avg by (node) (' - 'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]) ' - '+ rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))' -) -NET_TOP_EXPR = f"topk(1, {NET_SERIES_EXPR})" -IO_SERIES_EXPR = ( - "avg by (node) (rate(node_disk_read_bytes_total[5m]) " - "+ rate(node_disk_written_bytes_total[5m]))" -) -IO_TOP_EXPR = f"topk(1, {IO_SERIES_EXPR})" TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" NET_INGRESS_EXPR = ( 'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) ' @@ -463,8 +469,8 @@ def build_overview(): hottest = [ (7, "Hottest node: CPU", node_cpu_expr(), "percent"), (8, "Hottest node: RAM", node_mem_expr(), "percent"), - (9, "Hottest node: NET (rx+tx)", NET_SERIES_EXPR, "Bps"), - (10, "Hottest node: I/O (r+w)", IO_SERIES_EXPR, "Bps"), + (9, "Hottest node: NET (rx+tx)", node_net_expr(), "Bps"), + (10, "Hottest node: I/O (r+w)", node_io_expr(), "Bps"), ] for idx, (panel_id, title, expr, unit) in enumerate(hottest): panels.append( @@ -476,7 +482,7 @@ def build_overview(): unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, text_mode="name_and_value", - legend="{{node}}\\n", + legend="{{node}}", instant=True, links=link_to("atlas-nodes"), ) @@ -1021,7 +1027,7 @@ def build_network_dashboard(): timeseries_panel( 4, "Per-node throughput", - NET_SERIES_EXPR, + node_net_expr(), {"h": 8, "w": 24, "x": 0, "y": 4}, unit="Bps", legend="{{node}}", diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index abd9da7..42026eb 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -202,7 +202,7 @@ }, "targets": [ { - "expr": "avg by (node) 
(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))", + "expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index ad3a947..be5dead 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -440,7 +440,7 @@ { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ], @@ -513,7 +513,7 @@ { "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ], @@ -584,9 +584,9 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", + "expr": "topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * 
on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ], @@ -653,9 +653,9 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", + "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ], diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index 8f614ae..8b5d50d 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -211,7 +211,7 @@ data: }, "targets": [ { - "expr": "avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))", + "expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 5f3062a..26e0454 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -449,7 +449,7 @@ data: { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) 
* on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ], @@ -522,7 +522,7 @@ data: { "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ], @@ -593,9 +593,9 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", + "expr": "topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ], @@ -662,9 +662,9 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", + "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}\\n", + "legendFormat": "{{node}}", "instant": true } ], From 53427cc8fa893d22814aa5d94c338fcd3db8107b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 20:19:20 -0300 Subject: [PATCH 24/71] monitoring: fix net/io legend 
labels --- scripts/render_dashboards.py | 10 +++++++--- services/monitoring/dashboards/atlas-overview.json | 8 ++++---- services/monitoring/grafana-dashboard-overview.yaml | 8 ++++---- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 37f2607..cf9487f 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -144,6 +144,10 @@ def astreae_free_expr(mount): return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})" +def topk_with_node(expr): + return f'label_replace(topk(1, {expr}), "__name__", "$1", "node", "(.*)")' + + def node_net_expr(scope=""): base = ( 'sum by (instance) (' @@ -469,15 +473,15 @@ def build_overview(): hottest = [ (7, "Hottest node: CPU", node_cpu_expr(), "percent"), (8, "Hottest node: RAM", node_mem_expr(), "percent"), - (9, "Hottest node: NET (rx+tx)", node_net_expr(), "Bps"), - (10, "Hottest node: I/O (r+w)", node_io_expr(), "Bps"), + (9, "Hottest node: NET (rx+tx)", topk_with_node(node_net_expr()), "Bps"), + (10, "Hottest node: I/O (r+w)", topk_with_node(node_io_expr()), "Bps"), ] for idx, (panel_id, title, expr, unit) in enumerate(hottest): panels.append( stat_panel( panel_id, title, - f"topk(1, {expr})", + f"{expr}", {"h": 5, "w": 6, "x": 6 * idx, "y": 5}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index be5dead..e116b34 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -438,7 +438,7 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "avg by (node) (((1 - avg by 
(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -511,7 +511,7 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -584,7 +584,7 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -653,7 +653,7 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", 
\"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 26e0454..36f610b 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -447,7 +447,7 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -520,7 +520,7 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -593,7 +593,7 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) ((sum by (instance) 
(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -662,7 +662,7 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true From 76d3dc6ae238e880dfe7f39d3a9b04b2ef3fbea0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 21:20:19 -0300 Subject: [PATCH 25/71] monitoring: restore top1 hottest stats --- scripts/render_dashboards.py | 8 ++++---- services/monitoring/dashboards/atlas-network.json | 2 +- services/monitoring/dashboards/atlas-overview.json | 6 +++--- services/monitoring/grafana-dashboard-network.yaml | 2 +- services/monitoring/grafana-dashboard-overview.yaml | 6 +++--- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/scripts/render_dashboards.py 
b/scripts/render_dashboards.py index cf9487f..5d5c049 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -151,8 +151,8 @@ def topk_with_node(expr): def node_net_expr(scope=""): base = ( 'sum by (instance) (' - 'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]) ' - '+ rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))' + 'rate(node_network_receive_bytes_total{device!~"lo"}[5m]) ' + '+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))' ) return scoped_node_expr(base, scope) @@ -471,8 +471,8 @@ def build_overview(): ) hottest = [ - (7, "Hottest node: CPU", node_cpu_expr(), "percent"), - (8, "Hottest node: RAM", node_mem_expr(), "percent"), + (7, "Hottest node: CPU", topk_with_node(node_cpu_expr()), "percent"), + (8, "Hottest node: RAM", topk_with_node(node_mem_expr()), "percent"), (9, "Hottest node: NET (rx+tx)", topk_with_node(node_net_expr()), "Bps"), (10, "Hottest node: I/O (r+w)", topk_with_node(node_io_expr()), "Bps"), ] diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index 42026eb..0363b81 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -202,7 +202,7 @@ }, "targets": [ { - "expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } diff --git 
a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index e116b34..7f65265 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -438,7 +438,7 @@ }, "targets": [ { - "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -511,7 +511,7 @@ }, "targets": [ { - "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -584,7 +584,7 @@ }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", 
\"(.*)\")", + "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index 8b5d50d..2d7d989 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -211,7 +211,7 @@ data: }, "targets": [ { - "expr": "avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 36f610b..c1f8715 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -447,7 +447,7 @@ data: }, "targets": [ { - "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "label_replace(topk(1, avg by (node) (((1 - avg by 
(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -520,7 +520,7 @@ data: }, "targets": [ { - "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -593,7 +593,7 @@ data: }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true From b59677615c7145657f8d67c99699b30bcf86314a Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 21:48:12 -0300 Subject: [PATCH 
26/71] monitoring: worker/control-plane splits --- scripts/render_dashboards.py | 17 +++++++++-------- .../monitoring/dashboards/atlas-overview.json | 16 ++++++++-------- .../monitoring/grafana-dashboard-overview.yaml | 16 ++++++++-------- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 5d5c049..cf34d6a 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -509,11 +509,12 @@ def build_overview(): ) ) + worker_filter = f"{WORKER_REGEX}" panels.append( timeseries_panel( 13, - "Cluster node CPU", - node_cpu_expr(), + "Worker node CPU", + node_cpu_expr(worker_filter), {"h": 8, "w": 12, "x": 0, "y": 19}, unit="percent", legend="{{node}}", @@ -526,8 +527,8 @@ def build_overview(): panels.append( timeseries_panel( 14, - "Cluster node RAM", - node_mem_expr(), + "Worker node RAM", + node_mem_expr(worker_filter), {"h": 8, "w": 12, "x": 12, "y": 19}, unit="percent", legend="{{node}}", @@ -541,8 +542,8 @@ def build_overview(): panels.append( timeseries_panel( 15, - "Control plane CPU (incl. titan-db)", - node_cpu_expr(CONTROL_ALL_REGEX), + "Control plane CPU", + node_cpu_expr(CONTROL_REGEX), {"h": 7, "w": 12, "x": 0, "y": 27}, unit="percent", legend="{{node}}", @@ -553,8 +554,8 @@ def build_overview(): panels.append( timeseries_panel( 16, - "Control plane RAM (incl. 
titan-db)", - node_mem_expr(CONTROL_ALL_REGEX), + "Control plane RAM", + node_mem_expr(CONTROL_REGEX), {"h": 7, "w": 12, "x": 12, "y": 27}, unit="percent", legend="{{node}}", diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 7f65265..bd081a7 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -793,7 +793,7 @@ { "id": 13, "type": "timeseries", - "title": "Cluster node CPU", + "title": "Worker node CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -806,7 +806,7 @@ }, "targets": [ { - "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -840,7 +840,7 @@ { "id": 14, "type": "timeseries", - "title": "Cluster node RAM", + "title": "Worker node RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -853,7 +853,7 @@ }, "targets": [ { - "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - 
node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -887,7 +887,7 @@ { "id": 15, "type": "timeseries", - "title": "Control plane CPU (incl. titan-db)", + "title": "Control plane CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -900,7 +900,7 @@ }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -924,7 +924,7 @@ { "id": 16, "type": "timeseries", - "title": "Control plane RAM (incl. 
titan-db)", + "title": "Control plane RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -937,7 +937,7 @@ }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index c1f8715..fb3d111 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -802,7 +802,7 @@ data: { "id": 13, "type": "timeseries", - "title": "Cluster node CPU", + "title": "Worker node CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -815,7 +815,7 @@ data: }, "targets": [ { - "expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() 
label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -849,7 +849,7 @@ data: { "id": 14, "type": "timeseries", - "title": "Cluster node RAM", + "title": "Worker node RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -862,7 +862,7 @@ data: }, "targets": [ { - "expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -896,7 +896,7 @@ data: { "id": 15, "type": "timeseries", - "title": "Control plane CPU (incl. 
titan-db)", + "title": "Control plane CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -909,7 +909,7 @@ data: }, "targets": [ { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } @@ -933,7 +933,7 @@ data: { "id": 16, "type": "timeseries", - "title": "Control plane RAM (incl. 
titan-db)", + "title": "Control plane RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -946,7 +946,7 @@ data: }, "targets": [ { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}" } From be6052c47c18c1bcef61af0046fa77d432a369cc Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 21:57:40 -0300 Subject: [PATCH 27/71] monitoring: unify namespace share panels --- scripts/render_dashboards.py | 25 ++++++++++++++----- .../monitoring/dashboards/atlas-overview.json | 4 +-- .../grafana-dashboard-overview.yaml | 4 +-- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index cf34d6a..3c0d6fa 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -165,6 +165,14 @@ def node_io_expr(scope=""): return scoped_node_expr(base, scope) +def namespace_cpu_share_expr(): + return f"({NAMESPACE_CPU_EXPR}) * on(namespace) group_left() ({NAMESPACE_COMBINED_FILTER})" + + +def namespace_ram_share_expr(): + return f"({NAMESPACE_RAM_EXPR}) * on(namespace) group_left() ({NAMESPACE_COMBINED_FILTER})" + + PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) 
(kube_pod_status_phase{phase!~"Running|Succeeded"}))' CRASHLOOP_EXPR = ( 'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason' @@ -199,12 +207,17 @@ STUCK_TABLE_EXPR = ( ) NAMESPACE_CPU_EXPR = ( - 'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=""' - ',pod!=""}[5m])) by (namespace))' + 'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)' ) NAMESPACE_RAM_EXPR = ( - 'topk(10, sum(container_memory_working_set_bytes{namespace!=""' - ',pod!=""}) by (namespace))' + 'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)' +) +NAMESPACE_COMBINED_FILTER = ( + 'topk(10, (' + + NAMESPACE_CPU_EXPR + + ") + (" + + NAMESPACE_RAM_EXPR + + ' / 1e9))' ) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" NET_INGRESS_EXPR = ( @@ -496,7 +509,7 @@ def build_overview(): pie_panel( 11, "Namespace CPU share", - NAMESPACE_CPU_EXPR, + namespace_cpu_share_expr(), {"h": 9, "w": 12, "x": 0, "y": 10}, ) ) @@ -504,7 +517,7 @@ def build_overview(): pie_panel( 12, "Namespace RAM share", - NAMESPACE_RAM_EXPR, + namespace_ram_share_expr(), {"h": 9, "w": 12, "x": 12, "y": 10}, ) ) diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index bd081a7..7529ae8 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -722,7 +722,7 @@ }, "targets": [ { - "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))", + "expr": "(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 
1e9)))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -764,7 +764,7 @@ }, "targets": [ { - "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))", + "expr": "(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index fb3d111..ea3523c 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -731,7 +731,7 @@ data: }, "targets": [ { - "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))", + "expr": "(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -773,7 +773,7 @@ data: }, "targets": [ { - "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))", + "expr": "(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))", "refId": 
"A", "legendFormat": "{{namespace}}" } From 37e51b361bed4512455086f0562ea44e82c71e9e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 22:06:06 -0300 Subject: [PATCH 28/71] monitoring: normalize namespace share --- scripts/render_dashboards.py | 18 ++++++++++++------ .../monitoring/dashboards/atlas-overview.json | 4 ++-- .../monitoring/grafana-dashboard-overview.yaml | 4 ++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 3c0d6fa..a3ffb94 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -166,11 +166,17 @@ def node_io_expr(scope=""): def namespace_cpu_share_expr(): - return f"({NAMESPACE_CPU_EXPR}) * on(namespace) group_left() ({NAMESPACE_COMBINED_FILTER})" + return ( + f"(100 * ( {NAMESPACE_CPU_RAW} ) / sum( {NAMESPACE_CPU_RAW} )) * on(namespace) group_left() " + f"( {NAMESPACE_COMBINED_FILTER} )" + ) def namespace_ram_share_expr(): - return f"({NAMESPACE_RAM_EXPR}) * on(namespace) group_left() ({NAMESPACE_COMBINED_FILTER})" + return ( + f"(100 * ( {NAMESPACE_RAM_RAW} ) / sum( {NAMESPACE_RAM_RAW} )) * on(namespace) group_left() " + f"( {NAMESPACE_COMBINED_FILTER} )" + ) PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' @@ -206,17 +212,17 @@ STUCK_TABLE_EXPR = ( ")" ) -NAMESPACE_CPU_EXPR = ( +NAMESPACE_CPU_RAW = ( 'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)' ) -NAMESPACE_RAM_EXPR = ( +NAMESPACE_RAM_RAW = ( 'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)' ) NAMESPACE_COMBINED_FILTER = ( 'topk(10, (' - + NAMESPACE_CPU_EXPR + + NAMESPACE_CPU_RAW + ") + (" - + NAMESPACE_RAM_EXPR + + NAMESPACE_RAM_RAW + ' / 1e9))' ) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" diff --git a/services/monitoring/dashboards/atlas-overview.json 
b/services/monitoring/dashboards/atlas-overview.json index 7529ae8..572f439 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -722,7 +722,7 @@ }, "targets": [ { - "expr": "(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))", + "expr": "(100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )", "refId": "A", "legendFormat": "{{namespace}}" } @@ -764,7 +764,7 @@ }, "targets": [ { - "expr": "(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))", + "expr": "(100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index ea3523c..77f73ef 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -731,7 +731,7 @@ data: }, "targets": [ { - "expr": "(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))", + "expr": "(100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )", "refId": "A", "legendFormat": "{{namespace}}" } @@ -773,7 +773,7 @@ data: }, "targets": [ { - "expr": "(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace)) * on(namespace) group_left() (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))", + "expr": "(100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) / sum( 
sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )", "refId": "A", "legendFormat": "{{namespace}}" } From cc62f497e93f46c9f1f4d518c097fc529626fd46 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 22:19:01 -0300 Subject: [PATCH 29/71] monitoring: fix namespace share percentages --- scripts/render_dashboards.py | 12 ++++-------- services/monitoring/dashboards/atlas-overview.json | 4 ++-- services/monitoring/grafana-dashboard-overview.yaml | 4 ++-- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index a3ffb94..74c8f7a 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -166,17 +166,13 @@ def node_io_expr(scope=""): def namespace_cpu_share_expr(): - return ( - f"(100 * ( {NAMESPACE_CPU_RAW} ) / sum( {NAMESPACE_CPU_RAW} )) * on(namespace) group_left() " - f"( {NAMESPACE_COMBINED_FILTER} )" - ) + selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" + return f"100 * ( {selected} ) / sum( {selected} )" def namespace_ram_share_expr(): - return ( - f"(100 * ( {NAMESPACE_RAM_RAW} ) / sum( {NAMESPACE_RAM_RAW} )) * on(namespace) group_left() " - f"( {NAMESPACE_COMBINED_FILTER} )" - ) + selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" + return f"100 * ( {selected} ) / sum( {selected} )" PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 572f439..753a33d 100644 --- 
a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -722,7 +722,7 @@ }, "targets": [ { - "expr": "(100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) )", "refId": "A", "legendFormat": "{{namespace}}" } @@ -764,7 +764,7 @@ }, "targets": [ { - "expr": "(100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )", + "expr": "100 
* ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) )", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 77f73ef..97bc359 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -731,7 +731,7 @@ data: }, "targets": [ { - "expr": "(100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) )", "refId": "A", "legendFormat": "{{namespace}}" } @@ -773,7 +773,7 @@ data: }, "targets": [ { - "expr": "(100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )) * on(namespace) group_left() ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) )", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) )", "refId": "A", "legendFormat": "{{namespace}}" } From 255e014e0a188544deb0d7f9d29288b567f24810 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 22:36:50 -0300 Subject: [PATCH 30/71] monitoring: color namespace pies --- scripts/render_dashboards.py | 35 +- .../monitoring/dashboards/atlas-overview.json | 
322 +++++++++++++++++- .../grafana-dashboard-overview.yaml | 322 +++++++++++++++++- 3 files changed, 670 insertions(+), 9 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 74c8f7a..bec895a 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -361,6 +361,18 @@ def table_panel( def pie_panel(panel_id, title, expr, grid): """Return a pie chart panel with readable namespace labels.""" + palette = [ + "#EF476F", + "#FFD166", + "#06D6A0", + "#118AB2", + "#073B4C", + "#F78C6B", + "#8EECF5", + "#E0FF4F", + "#B5179E", + "#52B788", + ] return { "id": panel_id, "type": "piechart", @@ -368,7 +380,28 @@ def pie_panel(panel_id, title, expr, grid): "datasource": PROM_DS, "gridPos": grid, "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}], - "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []}, + "fieldConfig": { + "defaults": { + "unit": "percent", + "custom": {"gradientMode": "scheme"}, + "color": {"mode": "palette-classic"}, + }, + "overrides": [ + { + "matcher": {"id": "byIndex", "options": idx}, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": palette[idx % len(palette)], + }, + } + ], + } + for idx in range(10) + ], + }, "options": { "legend": {"displayMode": "list", "placement": "right"}, "pieType": "pie", diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 753a33d..8be7f9d 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -729,9 +729,166 @@ ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "custom": { + "gradientMode": "scheme" + }, + "color": { + "mode": "palette-classic" + } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byIndex", + "options": 0 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": 
"#EF476F" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 1 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#FFD166" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 2 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#06D6A0" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 3 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#118AB2" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 4 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#073B4C" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 5 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#F78C6B" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 6 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#8EECF5" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 7 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#E0FF4F" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 8 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#B5179E" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 9 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#52B788" + } + } + ] + } + ] }, "options": { "legend": { @@ -771,9 +928,166 @@ ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "custom": { + "gradientMode": "scheme" + }, + "color": { + "mode": "palette-classic" + } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byIndex", + "options": 0 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#EF476F" + } + } + ] 
+ }, + { + "matcher": { + "id": "byIndex", + "options": 1 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#FFD166" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 2 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#06D6A0" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 3 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#118AB2" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 4 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#073B4C" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 5 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#F78C6B" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 6 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#8EECF5" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 7 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#E0FF4F" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 8 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#B5179E" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 9 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#52B788" + } + } + ] + } + ] }, "options": { "legend": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 97bc359..06a40c7 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -738,9 +738,166 @@ data: ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "custom": { + "gradientMode": "scheme" + }, + 
"color": { + "mode": "palette-classic" + } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byIndex", + "options": 0 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#EF476F" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 1 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#FFD166" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 2 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#06D6A0" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 3 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#118AB2" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 4 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#073B4C" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 5 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#F78C6B" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 6 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#8EECF5" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 7 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#E0FF4F" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 8 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#B5179E" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 9 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#52B788" + } + } + ] + } + ] }, "options": { "legend": { @@ -780,9 +937,166 @@ data: ], "fieldConfig": { "defaults": { - "unit": "percent" + "unit": "percent", + "custom": { + "gradientMode": "scheme" + }, + "color": { + 
"mode": "palette-classic" + } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byIndex", + "options": 0 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#EF476F" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 1 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#FFD166" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 2 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#06D6A0" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 3 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#118AB2" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 4 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#073B4C" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 5 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#F78C6B" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 6 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#8EECF5" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 7 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#E0FF4F" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 8 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#B5179E" + } + } + ] + }, + { + "matcher": { + "id": "byIndex", + "options": 9 + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#52B788" + } + } + ] + } + ] }, "options": { "legend": { From 442a89d327b8f198546b68a42154d1555f71c7c2 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 22:39:50 -0300 Subject: [PATCH 31/71] monitoring: fix pie colors & 
thresholds --- scripts/render_dashboards.py | 41 +-- .../monitoring/dashboards/atlas-overview.json | 332 +----------------- .../grafana-dashboard-overview.yaml | 332 +----------------- 3 files changed, 22 insertions(+), 683 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index bec895a..2b0af09 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -361,18 +361,6 @@ def table_panel( def pie_panel(panel_id, title, expr, grid): """Return a pie chart panel with readable namespace labels.""" - palette = [ - "#EF476F", - "#FFD166", - "#06D6A0", - "#118AB2", - "#073B4C", - "#F78C6B", - "#8EECF5", - "#E0FF4F", - "#B5179E", - "#52B788", - ] return { "id": panel_id, "type": "piechart", @@ -380,28 +368,7 @@ def pie_panel(panel_id, title, expr, grid): "datasource": PROM_DS, "gridPos": grid, "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}], - "fieldConfig": { - "defaults": { - "unit": "percent", - "custom": {"gradientMode": "scheme"}, - "color": {"mode": "palette-classic"}, - }, - "overrides": [ - { - "matcher": {"id": "byIndex", "options": idx}, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": palette[idx % len(palette)], - }, - } - ], - } - for idx in range(10) - ], - }, + "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []}, "options": { "legend": {"displayMode": "list", "placement": "right"}, "pieType": "pie", @@ -482,7 +449,7 @@ def build_overview(): thresholds = { "mode": "absolute", "steps": [ - {"color": "red", "value": None}, + {"color": "red", "value": 0}, {"color": "orange", "value": WORKER_TOTAL - 2}, {"color": "yellow", "value": WORKER_TOTAL - 1}, {"color": "green", "value": WORKER_TOTAL}, @@ -492,7 +459,7 @@ def build_overview(): thresholds = { "mode": "absolute", "steps": [ - {"color": "red", "value": None}, + {"color": "red", "value": 0}, {"color": "green", "value": CONTROL_TOTAL}, ], } @@ -500,7 +467,7 @@ def 
build_overview(): thresholds = { "mode": "absolute", "steps": [ - {"color": "green", "value": None}, + {"color": "green", "value": 0}, {"color": "yellow", "value": 1}, {"color": "orange", "value": 2}, {"color": "red", "value": 3}, diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 8be7f9d..4481904 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -38,7 +38,7 @@ "steps": [ { "color": "red", - "value": null + "value": 0 }, { "color": "orange", @@ -107,7 +107,7 @@ "steps": [ { "color": "red", - "value": null + "value": 0 }, { "color": "green", @@ -168,7 +168,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "yellow", @@ -243,7 +243,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "yellow", @@ -318,7 +318,7 @@ "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "yellow", @@ -729,166 +729,9 @@ ], "fieldConfig": { "defaults": { - "unit": "percent", - "custom": { - "gradientMode": "scheme" - }, - "color": { - "mode": "palette-classic" - } + "unit": "percent" }, - "overrides": [ - { - "matcher": { - "id": "byIndex", - "options": 0 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#EF476F" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 1 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#FFD166" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 2 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#06D6A0" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 3 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#118AB2" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 4 - }, - "properties": [ - { - "id": "color", - 
"value": { - "mode": "fixed", - "fixedColor": "#073B4C" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 5 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#F78C6B" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 6 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#8EECF5" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 7 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#E0FF4F" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 8 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#B5179E" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 9 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#52B788" - } - } - ] - } - ] + "overrides": [] }, "options": { "legend": { @@ -928,166 +771,9 @@ ], "fieldConfig": { "defaults": { - "unit": "percent", - "custom": { - "gradientMode": "scheme" - }, - "color": { - "mode": "palette-classic" - } + "unit": "percent" }, - "overrides": [ - { - "matcher": { - "id": "byIndex", - "options": 0 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#EF476F" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 1 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#FFD166" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 2 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#06D6A0" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 3 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#118AB2" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 4 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": 
"fixed", - "fixedColor": "#073B4C" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 5 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#F78C6B" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 6 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#8EECF5" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 7 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#E0FF4F" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 8 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#B5179E" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 9 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#52B788" - } - } - ] - } - ] + "overrides": [] }, "options": { "legend": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 06a40c7..926c71c 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -47,7 +47,7 @@ data: "steps": [ { "color": "red", - "value": null + "value": 0 }, { "color": "orange", @@ -116,7 +116,7 @@ data: "steps": [ { "color": "red", - "value": null + "value": 0 }, { "color": "green", @@ -177,7 +177,7 @@ data: "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "yellow", @@ -252,7 +252,7 @@ data: "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "yellow", @@ -327,7 +327,7 @@ data: "steps": [ { "color": "green", - "value": null + "value": 0 }, { "color": "yellow", @@ -738,166 +738,9 @@ data: ], "fieldConfig": { "defaults": { - "unit": "percent", - "custom": { - "gradientMode": "scheme" - }, - "color": { - "mode": "palette-classic" - } + "unit": "percent" }, - "overrides": [ - { - "matcher": 
{ - "id": "byIndex", - "options": 0 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#EF476F" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 1 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#FFD166" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 2 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#06D6A0" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 3 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#118AB2" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 4 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#073B4C" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 5 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#F78C6B" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 6 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#8EECF5" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 7 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#E0FF4F" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 8 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#B5179E" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 9 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#52B788" - } - } - ] - } - ] + "overrides": [] }, "options": { "legend": { @@ -937,166 +780,9 @@ data: ], "fieldConfig": { "defaults": { - "unit": "percent", - "custom": { - "gradientMode": "scheme" - }, - "color": { - "mode": "palette-classic" - } + "unit": "percent" }, - "overrides": [ - { - "matcher": { - "id": 
"byIndex", - "options": 0 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#EF476F" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 1 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#FFD166" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 2 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#06D6A0" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 3 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#118AB2" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 4 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#073B4C" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 5 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#F78C6B" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 6 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#8EECF5" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 7 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#E0FF4F" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 8 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#B5179E" - } - } - ] - }, - { - "matcher": { - "id": "byIndex", - "options": 9 - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#52B788" - } - } - ] - } - ] + "overrides": [] }, "options": { "legend": { From c53c5183012e452ffbc15a719f2f6cf09aa48898 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 22:58:57 -0300 Subject: [PATCH 32/71] monitoring: express namespace share as cluster percent --- scripts/render_dashboards.py | 4 ++-- 
services/monitoring/dashboards/atlas-overview.json | 4 ++-- services/monitoring/grafana-dashboard-overview.yaml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 2b0af09..4e8e5a5 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -167,12 +167,12 @@ def node_io_expr(scope=""): def namespace_cpu_share_expr(): selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - return f"100 * ( {selected} ) / sum( {selected} )" + return f"100 * ( {selected} ) / sum( {NAMESPACE_CPU_RAW} )" def namespace_ram_share_expr(): selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - return f"100 * ( {selected} ) / sum( {selected} )" + return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )" PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 4481904..55c1909 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -722,7 +722,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by 
(namespace) / 1e9)) ) )", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", "refId": "A", "legendFormat": "{{namespace}}" } @@ -764,7 +764,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) )", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 
926c71c..deeeacc 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -731,7 +731,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) )", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", "refId": "A", "legendFormat": "{{namespace}}" } @@ -773,7 +773,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by 
(namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) )", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )", "refId": "A", "legendFormat": "{{namespace}}" } From 0708522b280fb3d0978f75458e979360f353f740 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 23:12:16 -0300 Subject: [PATCH 33/71] monitoring: add namespace gpu share --- scripts/render_dashboards.py | 46 ++++++++---- .../monitoring/dashboards/atlas-overview.json | 72 +++++++++++++++---- .../grafana-dashboard-overview.yaml | 72 +++++++++++++++---- 3 files changed, 145 insertions(+), 45 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 4e8e5a5..1248984 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -175,6 +175,11 @@ def namespace_ram_share_expr(): return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )" +def namespace_gpu_share_expr(): + selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" + return f"100 * ( {selected} ) / sum( {NAMESPACE_GPU_RAW} )" + + PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' CRASHLOOP_EXPR = ( 'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason' @@ -214,6 +219,9 @@ NAMESPACE_CPU_RAW = ( NAMESPACE_RAM_RAW = ( 
'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)' ) +NAMESPACE_GPU_RAW = ( + 'sum(rate(container_gpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)' +) NAMESPACE_COMBINED_FILTER = ( 'topk(10, (' + NAMESPACE_CPU_RAW @@ -512,7 +520,7 @@ def build_overview(): 11, "Namespace CPU share", namespace_cpu_share_expr(), - {"h": 9, "w": 12, "x": 0, "y": 10}, + {"h": 9, "w": 8, "x": 0, "y": 10}, ) ) panels.append( @@ -520,14 +528,22 @@ def build_overview(): 12, "Namespace RAM share", namespace_ram_share_expr(), - {"h": 9, "w": 12, "x": 12, "y": 10}, + {"h": 9, "w": 8, "x": 8, "y": 10}, + ) + ) + panels.append( + pie_panel( + 13, + "Namespace GPU share", + namespace_gpu_share_expr(), + {"h": 9, "w": 8, "x": 16, "y": 10}, ) ) worker_filter = f"{WORKER_REGEX}" panels.append( timeseries_panel( - 13, + 14, "Worker node CPU", node_cpu_expr(worker_filter), {"h": 8, "w": 12, "x": 0, "y": 19}, @@ -541,7 +557,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 14, + 15, "Worker node RAM", node_mem_expr(worker_filter), {"h": 8, "w": 12, "x": 12, "y": 19}, @@ -556,7 +572,7 @@ def build_overview(): panels.append( timeseries_panel( - 15, + 16, "Control plane CPU", node_cpu_expr(CONTROL_REGEX), {"h": 7, "w": 12, "x": 0, "y": 27}, @@ -568,7 +584,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 16, + 17, "Control plane RAM", node_mem_expr(CONTROL_REGEX), {"h": 7, "w": 12, "x": 12, "y": 27}, @@ -581,7 +597,7 @@ def build_overview(): panels.append( timeseries_panel( - 17, + 18, "Cluster ingress throughput", NET_INGRESS_EXPR, {"h": 7, "w": 12, "x": 0, "y": 34}, @@ -593,7 +609,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 18, + 19, "Cluster egress throughput", NET_EGRESS_EXPR, {"h": 7, "w": 12, "x": 12, "y": 34}, @@ -606,7 +622,7 @@ def build_overview(): panels.append( timeseries_panel( - 19, + 20, "Root filesystem usage", root_usage_expr(), {"h": 8, "w": 12, "x": 0, "y": 41}, @@ 
-621,7 +637,7 @@ def build_overview(): ) panels.append( { - "id": 20, + "id": 21, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": PROM_DS, @@ -655,10 +671,10 @@ def build_overview(): ) storage_panels = [ - (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), - (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), - (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), - (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), + (22, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), + (23, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), + (24, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), + (25, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), ] for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): panels.append( @@ -675,7 +691,7 @@ def build_overview(): panels.append( text_panel( - 25, + 26, "About this dashboard", textwrap.dedent( """\ diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 55c1909..47aa5da 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -716,7 +716,7 @@ }, "gridPos": { "h": 9, - "w": 12, + "w": 8, "x": 0, "y": 10 }, @@ -758,8 +758,8 @@ }, "gridPos": { "h": 9, - "w": 12, - "x": 12, + "w": 8, + "x": 8, "y": 10 }, "targets": [ @@ -792,6 +792,48 @@ }, { "id": 13, + "type": "piechart", + "title": "Namespace GPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 10 + }, + "targets": [ + { + "expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 14, "type": "timeseries", "title": "Worker node CPU", "datasource": { @@ -838,7 +880,7 @@ ] }, { - "id": 14, + "id": 15, "type": "timeseries", "title": "Worker node RAM", "datasource": { @@ -885,7 +927,7 @@ ] }, { - "id": 15, + "id": 16, "type": "timeseries", "title": "Control plane CPU", "datasource": { @@ -922,7 +964,7 @@ } }, { - "id": 16, + "id": 17, "type": "timeseries", "title": "Control plane RAM", "datasource": { @@ -959,7 +1001,7 @@ } }, { - "id": 17, + "id": 18, "type": "timeseries", "title": "Cluster ingress throughput", "datasource": { @@ -1002,7 +1044,7 @@ ] }, { - "id": 18, + "id": 19, "type": "timeseries", "title": "Cluster egress throughput", "datasource": { @@ -1045,7 +1087,7 @@ ] }, { - "id": 19, + "id": 20, "type": "timeseries", "title": "Root filesystem usage", "datasource": { @@ -1093,7 +1135,7 @@ ] }, { - "id": 20, + "id": 21, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -1162,7 +1204,7 @@ ] }, { - "id": 21, + "id": 22, "type": "stat", "title": "Astreae usage", "datasource": { @@ -1233,7 +1275,7 @@ ] }, { - "id": 22, + "id": 23, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1304,7 +1346,7 @@ ] }, { - "id": 23, + "id": 24, "type": "stat", "title": "Astreae free", "datasource": { @@ -1371,7 +1413,7 @@ ] }, { - "id": 24, + "id": 25, "type": "stat", "title": "Asteria free", "datasource": { @@ -1438,7 +1480,7 @@ ] }, { 
- "id": 25, + "id": 26, "type": "text", "title": "About this dashboard", "gridPos": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index deeeacc..96136d7 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -725,7 +725,7 @@ data: }, "gridPos": { "h": 9, - "w": 12, + "w": 8, "x": 0, "y": 10 }, @@ -767,8 +767,8 @@ data: }, "gridPos": { "h": 9, - "w": 12, - "x": 12, + "w": 8, + "x": 8, "y": 10 }, "targets": [ @@ -801,6 +801,48 @@ data: }, { "id": 13, + "type": "piechart", + "title": "Namespace GPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 10 + }, + "targets": [ + { + "expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 14, "type": "timeseries", "title": "Worker node CPU", "datasource": { @@ -847,7 +889,7 @@ data: ] }, { - "id": 14, + "id": 15, "type": "timeseries", "title": "Worker node RAM", "datasource": { @@ -894,7 +936,7 @@ data: ] }, { - "id": 15, + "id": 16, "type": "timeseries", "title": "Control plane CPU", "datasource": { @@ -931,7 +973,7 @@ 
data: } }, { - "id": 16, + "id": 17, "type": "timeseries", "title": "Control plane RAM", "datasource": { @@ -968,7 +1010,7 @@ data: } }, { - "id": 17, + "id": 18, "type": "timeseries", "title": "Cluster ingress throughput", "datasource": { @@ -1011,7 +1053,7 @@ data: ] }, { - "id": 18, + "id": 19, "type": "timeseries", "title": "Cluster egress throughput", "datasource": { @@ -1054,7 +1096,7 @@ data: ] }, { - "id": 19, + "id": 20, "type": "timeseries", "title": "Root filesystem usage", "datasource": { @@ -1102,7 +1144,7 @@ data: ] }, { - "id": 20, + "id": 21, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -1171,7 +1213,7 @@ data: ] }, { - "id": 21, + "id": 22, "type": "stat", "title": "Astreae usage", "datasource": { @@ -1242,7 +1284,7 @@ data: ] }, { - "id": 22, + "id": 23, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1313,7 +1355,7 @@ data: ] }, { - "id": 23, + "id": 24, "type": "stat", "title": "Astreae free", "datasource": { @@ -1380,7 +1422,7 @@ data: ] }, { - "id": 24, + "id": 25, "type": "stat", "title": "Asteria free", "datasource": { @@ -1447,7 +1489,7 @@ data: ] }, { - "id": 25, + "id": 26, "type": "text", "title": "About this dashboard", "gridPos": { From f4dd1de43fccf81562606223d89dcb076fb187a2 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 23:18:53 -0300 Subject: [PATCH 34/71] monitoring: reorder namespace pies and add gpu data --- scripts/render_dashboards.py | 28 ++++++++++++------- .../monitoring/dashboards/atlas-overview.json | 12 ++++---- .../grafana-dashboard-overview.yaml | 12 ++++---- 3 files changed, 30 insertions(+), 22 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 1248984..1c4aef2 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -167,17 +167,20 @@ def node_io_expr(scope=""): def namespace_cpu_share_expr(): selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - 
return f"100 * ( {selected} ) / sum( {NAMESPACE_CPU_RAW} )" + total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)" + return f"100 * ( {selected} ) / {total}" def namespace_ram_share_expr(): selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )" + total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)" + return f"100 * ( {selected} ) / {total}" def namespace_gpu_share_expr(): selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - return f"100 * ( {selected} ) / sum( {NAMESPACE_GPU_RAW} )" + total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" + return f"100 * ( {selected} ) / {total}" PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' @@ -222,12 +225,17 @@ NAMESPACE_RAM_RAW = ( NAMESPACE_GPU_RAW = ( 'sum(rate(container_gpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)' ) +NAMESPACE_GPU_RAW = ( + 'sum(kube_pod_container_resource_requests{resource="nvidia.com/gpu",namespace!=""}) by (namespace)' +) NAMESPACE_COMBINED_FILTER = ( 'topk(10, (' + NAMESPACE_CPU_RAW + ") + (" + NAMESPACE_RAM_RAW - + ' / 1e9))' + + ' / 1e9) + (' + + NAMESPACE_GPU_RAW + + ' * 10))' ) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" NET_INGRESS_EXPR = ( @@ -518,24 +526,24 @@ def build_overview(): panels.append( pie_panel( 11, - "Namespace CPU share", - namespace_cpu_share_expr(), + "Namespace GPU share", + namespace_gpu_share_expr(), {"h": 9, "w": 8, "x": 0, "y": 10}, ) ) panels.append( pie_panel( 12, - "Namespace RAM share", - namespace_ram_share_expr(), + "Namespace CPU share", + namespace_cpu_share_expr(), {"h": 9, "w": 8, "x": 8, "y": 10}, ) ) panels.append( pie_panel( 13, - "Namespace GPU share", - namespace_gpu_share_expr(), + "Namespace RAM share", + namespace_ram_share_expr(), {"h": 9, "w": 8, "x": 16, "y": 10}, ) ) diff --git 
a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 47aa5da..f833b89 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -709,7 +709,7 @@ { "id": 11, "type": "piechart", - "title": "Namespace CPU share", + "title": "Namespace GPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -722,7 +722,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -751,7 +751,7 @@ { "id": 12, "type": "piechart", - "title": "Namespace RAM share", + "title": "Namespace CPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -764,7 +764,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, 
(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -793,7 +793,7 @@ { "id": 13, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace RAM share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -806,7 +806,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 96136d7..fb4e13a 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -718,7 +718,7 @@ data: { "id": 11, "type": "piechart", - "title": "Namespace CPU share", + "title": "Namespace GPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -731,7 +731,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)", 
"refId": "A", "legendFormat": "{{namespace}}" } @@ -760,7 +760,7 @@ data: { "id": 12, "type": "piechart", - "title": "Namespace RAM share", + "title": "Namespace CPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -773,7 +773,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -802,7 +802,7 @@ data: { "id": 13, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace RAM share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -815,7 +815,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_gpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } From aef3176c1cf5e0196ecc0fa7641cde53c79c576b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 23:40:22 -0300 Subject: [PATCH 35/71] monitoring: fix hottest stats and gpu share --- scripts/render_dashboards.py | 2 +- services/monitoring/dashboards/atlas-overview.json | 8 ++++---- services/monitoring/grafana-dashboard-overview.yaml | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 1c4aef2..a09eeae 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -145,7 +145,7 @@ def astreae_free_expr(mount): def topk_with_node(expr): - return f'label_replace(topk(1, {expr}), "__name__", "$1", "node", "(.*)")' + return f"topk(1, {expr})" def node_net_expr(scope=""): diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index f833b89..1bb0b53 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -438,7 +438,7 @@ }, "targets": [ { - "expr": 
"label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -511,7 +511,7 @@ }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -584,7 +584,7 @@ }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", 
"refId": "A", "legendFormat": "{{node}}", "instant": true @@ -653,7 +653,7 @@ }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", "legendFormat": "{{node}}", "instant": true diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index fb4e13a..f2ef289 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -447,7 +447,7 @@ data: }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -520,7 +520,7 @@ data: }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", 
\"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -593,7 +593,7 @@ data: }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -662,7 +662,7 @@ data: }, "targets": [ { - "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", + "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", "legendFormat": "{{node}}", "instant": true From beb3243839343349cfe2803ea9a8be634d9fc72c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 23:42:55 -0300 Subject: [PATCH 36/71] Revert GPU pie chart additions --- 
scripts/render_dashboards.py | 66 +++++--------- .../monitoring/dashboards/atlas-overview.json | 88 +++++-------------- .../grafana-dashboard-overview.yaml | 88 +++++-------------- 3 files changed, 67 insertions(+), 175 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index a09eeae..4e8e5a5 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -145,7 +145,7 @@ def astreae_free_expr(mount): def topk_with_node(expr): - return f"topk(1, {expr})" + return f'label_replace(topk(1, {expr}), "__name__", "$1", "node", "(.*)")' def node_net_expr(scope=""): @@ -167,20 +167,12 @@ def node_io_expr(scope=""): def namespace_cpu_share_expr(): selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)" - return f"100 * ( {selected} ) / {total}" + return f"100 * ( {selected} ) / sum( {NAMESPACE_CPU_RAW} )" def namespace_ram_share_expr(): selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)" - return f"100 * ( {selected} ) / {total}" - - -def namespace_gpu_share_expr(): - selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" - return f"100 * ( {selected} ) / {total}" + return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )" PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' @@ -222,20 +214,12 @@ NAMESPACE_CPU_RAW = ( NAMESPACE_RAM_RAW = ( 'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)' ) -NAMESPACE_GPU_RAW = ( - 'sum(rate(container_gpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)' -) -NAMESPACE_GPU_RAW = ( - 'sum(kube_pod_container_resource_requests{resource="nvidia.com/gpu",namespace!=""}) by (namespace)' -) NAMESPACE_COMBINED_FILTER = 
( 'topk(10, (' + NAMESPACE_CPU_RAW + ") + (" + NAMESPACE_RAM_RAW - + ' / 1e9) + (' - + NAMESPACE_GPU_RAW - + ' * 10))' + + ' / 1e9))' ) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" NET_INGRESS_EXPR = ( @@ -526,32 +510,24 @@ def build_overview(): panels.append( pie_panel( 11, - "Namespace GPU share", - namespace_gpu_share_expr(), - {"h": 9, "w": 8, "x": 0, "y": 10}, + "Namespace CPU share", + namespace_cpu_share_expr(), + {"h": 9, "w": 12, "x": 0, "y": 10}, ) ) panels.append( pie_panel( 12, - "Namespace CPU share", - namespace_cpu_share_expr(), - {"h": 9, "w": 8, "x": 8, "y": 10}, - ) - ) - panels.append( - pie_panel( - 13, "Namespace RAM share", namespace_ram_share_expr(), - {"h": 9, "w": 8, "x": 16, "y": 10}, + {"h": 9, "w": 12, "x": 12, "y": 10}, ) ) worker_filter = f"{WORKER_REGEX}" panels.append( timeseries_panel( - 14, + 13, "Worker node CPU", node_cpu_expr(worker_filter), {"h": 8, "w": 12, "x": 0, "y": 19}, @@ -565,7 +541,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 15, + 14, "Worker node RAM", node_mem_expr(worker_filter), {"h": 8, "w": 12, "x": 12, "y": 19}, @@ -580,7 +556,7 @@ def build_overview(): panels.append( timeseries_panel( - 16, + 15, "Control plane CPU", node_cpu_expr(CONTROL_REGEX), {"h": 7, "w": 12, "x": 0, "y": 27}, @@ -592,7 +568,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 17, + 16, "Control plane RAM", node_mem_expr(CONTROL_REGEX), {"h": 7, "w": 12, "x": 12, "y": 27}, @@ -605,7 +581,7 @@ def build_overview(): panels.append( timeseries_panel( - 18, + 17, "Cluster ingress throughput", NET_INGRESS_EXPR, {"h": 7, "w": 12, "x": 0, "y": 34}, @@ -617,7 +593,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 19, + 18, "Cluster egress throughput", NET_EGRESS_EXPR, {"h": 7, "w": 12, "x": 12, "y": 34}, @@ -630,7 +606,7 @@ def build_overview(): panels.append( timeseries_panel( - 20, + 19, "Root filesystem usage", root_usage_expr(), {"h": 8, "w": 12, "x": 0, "y": 41}, 
@@ -645,7 +621,7 @@ def build_overview(): ) panels.append( { - "id": 21, + "id": 20, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": PROM_DS, @@ -679,10 +655,10 @@ def build_overview(): ) storage_panels = [ - (22, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), - (23, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), - (24, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), - (25, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), + (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), + (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), + (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), + (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), ] for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): panels.append( @@ -699,7 +675,7 @@ def build_overview(): panels.append( text_panel( - 26, + 25, "About this dashboard", textwrap.dedent( """\ diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 1bb0b53..55c1909 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -438,7 +438,7 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -511,7 +511,7 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) ((avg by 
(instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -584,7 +584,7 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -653,7 +653,7 @@ }, "targets": [ { - "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", 
\"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -709,20 +709,20 @@ { "id": 11, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace CPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 8, + "w": 12, "x": 0, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", "refId": "A", "legendFormat": "{{namespace}}" } @@ -751,20 +751,20 @@ { "id": 12, "type": "piechart", - "title": "Namespace CPU share", + "title": "Namespace RAM share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 8, - "x": 8, + "w": 12, + "x": 12, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, 
(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )", "refId": "A", "legendFormat": "{{namespace}}" } @@ -792,48 +792,6 @@ }, { "id": 13, - "type": "piechart", - "title": "Namespace RAM share", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 16, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - 
"legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 14, "type": "timeseries", "title": "Worker node CPU", "datasource": { @@ -880,7 +838,7 @@ ] }, { - "id": 15, + "id": 14, "type": "timeseries", "title": "Worker node RAM", "datasource": { @@ -927,7 +885,7 @@ ] }, { - "id": 16, + "id": 15, "type": "timeseries", "title": "Control plane CPU", "datasource": { @@ -964,7 +922,7 @@ } }, { - "id": 17, + "id": 16, "type": "timeseries", "title": "Control plane RAM", "datasource": { @@ -1001,7 +959,7 @@ } }, { - "id": 18, + "id": 17, "type": "timeseries", "title": "Cluster ingress throughput", "datasource": { @@ -1044,7 +1002,7 @@ ] }, { - "id": 19, + "id": 18, "type": "timeseries", "title": "Cluster egress throughput", "datasource": { @@ -1087,7 +1045,7 @@ ] }, { - "id": 20, + "id": 19, "type": "timeseries", "title": "Root filesystem usage", "datasource": { @@ -1135,7 +1093,7 @@ ] }, { - "id": 21, + "id": 20, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -1204,7 +1162,7 @@ ] }, { - "id": 22, + "id": 21, "type": "stat", "title": "Astreae usage", "datasource": { @@ -1275,7 +1233,7 @@ ] }, { - "id": 23, + "id": 22, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1346,7 +1304,7 @@ ] }, { - "id": 24, + "id": 23, "type": "stat", "title": "Astreae free", "datasource": { @@ -1413,7 +1371,7 @@ ] }, { - "id": 25, + "id": 24, "type": "stat", "title": "Asteria free", "datasource": { @@ -1480,7 +1438,7 @@ ] }, { - "id": 26, + "id": 25, "type": "text", "title": "About this dashboard", "gridPos": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index f2ef289..deeeacc 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -447,7 +447,7 @@ data: }, 
"targets": [ { - "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -520,7 +520,7 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -593,7 +593,7 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), 
\"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -662,7 +662,7 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))), \"__name__\", \"$1\", \"node\", \"(.*)\")", "refId": "A", "legendFormat": "{{node}}", "instant": true @@ -718,20 +718,20 @@ data: { "id": 11, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace CPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 8, + "w": 12, "x": 0, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", "refId": "A", "legendFormat": "{{namespace}}" } @@ -760,20 +760,20 @@ data: { "id": 12, "type": "piechart", - "title": "Namespace CPU share", + "title": "Namespace RAM share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 8, - "x": 8, + "w": 12, + "x": 12, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )", "refId": "A", "legendFormat": "{{namespace}}" } @@ -801,48 +801,6 @@ data: }, { "id": 13, - "type": "piechart", - "title": "Namespace RAM share", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 16, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( 
sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum(kube_pod_container_resource_requests{resource=\"nvidia.com/gpu\",namespace!=\"\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 14, "type": "timeseries", "title": "Worker node CPU", "datasource": { @@ -889,7 +847,7 @@ data: ] }, { - "id": 15, + "id": 14, "type": "timeseries", "title": "Worker node RAM", "datasource": { @@ -936,7 +894,7 @@ data: ] }, { - "id": 16, + "id": 15, "type": "timeseries", "title": "Control plane CPU", "datasource": { @@ -973,7 +931,7 @@ data: } }, { - "id": 17, + "id": 16, "type": "timeseries", "title": "Control plane RAM", "datasource": { @@ -1010,7 +968,7 @@ data: } }, { - "id": 18, + "id": 17, "type": "timeseries", "title": "Cluster ingress throughput", "datasource": { @@ -1053,7 +1011,7 @@ data: ] }, { - "id": 19, + "id": 18, "type": "timeseries", "title": "Cluster egress throughput", "datasource": { @@ -1096,7 +1054,7 @@ data: ] }, { - "id": 20, + "id": 19, "type": "timeseries", "title": "Root filesystem usage", "datasource": { @@ -1144,7 +1102,7 @@ data: ] }, { - "id": 21, + "id": 20, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -1213,7 +1171,7 @@ data: ] }, { - "id": 22, + "id": 21, "type": 
"stat", "title": "Astreae usage", "datasource": { @@ -1284,7 +1242,7 @@ data: ] }, { - "id": 23, + "id": 22, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1355,7 +1313,7 @@ data: ] }, { - "id": 24, + "id": 23, "type": "stat", "title": "Astreae free", "datasource": { @@ -1422,7 +1380,7 @@ data: ] }, { - "id": 25, + "id": 24, "type": "stat", "title": "Asteria free", "datasource": { @@ -1489,7 +1447,7 @@ data: ] }, { - "id": 26, + "id": 25, "type": "text", "title": "About this dashboard", "gridPos": { From 2ba642d49f92b20057ab914687830c3d6edf449c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 00:11:39 -0300 Subject: [PATCH 37/71] monitoring: add gpu pie and tidy net panels --- scripts/render_dashboards.py | 51 ++++-- .../monitoring/dashboards/atlas-overview.json | 157 +++++++++++------- .../grafana-dashboard-overview.yaml | 157 +++++++++++------- 3 files changed, 239 insertions(+), 126 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 4e8e5a5..c194771 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -167,12 +167,20 @@ def node_io_expr(scope=""): def namespace_cpu_share_expr(): selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - return f"100 * ( {selected} ) / sum( {NAMESPACE_CPU_RAW} )" + total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)" + return f"100 * ( {selected} ) / {total}" def namespace_ram_share_expr(): selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - return f"100 * ( {selected} ) / sum( {NAMESPACE_RAM_RAW} )" + total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)" + return f"100 * ( {selected} ) / {total}" + + +def namespace_gpu_share_expr(): + selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" + total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" + return f"100 * ( {selected} ) / {total}" PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) 
(kube_pod_status_phase{phase!~"Running|Succeeded"}))' @@ -214,12 +222,17 @@ NAMESPACE_CPU_RAW = ( NAMESPACE_RAM_RAW = ( 'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)' ) +NAMESPACE_GPU_RAW = ( + 'sum(kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}) by (namespace)' +) NAMESPACE_COMBINED_FILTER = ( 'topk(10, (' + NAMESPACE_CPU_RAW + ") + (" + NAMESPACE_RAM_RAW - + ' / 1e9))' + + ' / 1e9) + ( ' + + NAMESPACE_GPU_RAW + + ' * 10))' ) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" NET_INGRESS_EXPR = ( @@ -512,22 +525,30 @@ def build_overview(): 11, "Namespace CPU share", namespace_cpu_share_expr(), - {"h": 9, "w": 12, "x": 0, "y": 10}, + {"h": 9, "w": 8, "x": 0, "y": 10}, ) ) panels.append( pie_panel( 12, + "Namespace GPU share", + namespace_gpu_share_expr(), + {"h": 9, "w": 8, "x": 8, "y": 10}, + ) + ) + panels.append( + pie_panel( + 13, "Namespace RAM share", namespace_ram_share_expr(), - {"h": 9, "w": 12, "x": 12, "y": 10}, + {"h": 9, "w": 8, "x": 16, "y": 10}, ) ) worker_filter = f"{WORKER_REGEX}" panels.append( timeseries_panel( - 13, + 14, "Worker node CPU", node_cpu_expr(worker_filter), {"h": 8, "w": 12, "x": 0, "y": 19}, @@ -541,7 +562,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 14, + 15, "Worker node RAM", node_mem_expr(worker_filter), {"h": 8, "w": 12, "x": 12, "y": 19}, @@ -556,7 +577,7 @@ def build_overview(): panels.append( timeseries_panel( - 15, + 16, "Control plane CPU", node_cpu_expr(CONTROL_REGEX), {"h": 7, "w": 12, "x": 0, "y": 27}, @@ -568,7 +589,7 @@ def build_overview(): ) panels.append( timeseries_panel( - 16, + 17, "Control plane RAM", node_mem_expr(CONTROL_REGEX), {"h": 7, "w": 12, "x": 12, "y": 27}, @@ -581,11 +602,12 @@ def build_overview(): panels.append( timeseries_panel( - 17, + 18, "Cluster ingress throughput", NET_INGRESS_EXPR, {"h": 7, "w": 12, "x": 0, "y": 34}, unit="Bps", + legend="Ingress", 
legend_display="list", legend_placement="bottom", links=link_to("atlas-network"), @@ -593,11 +615,12 @@ def build_overview(): ) panels.append( timeseries_panel( - 18, + 19, "Cluster egress throughput", NET_EGRESS_EXPR, {"h": 7, "w": 12, "x": 12, "y": 34}, unit="Bps", + legend="Egress", legend_display="list", legend_placement="bottom", links=link_to("atlas-network"), @@ -606,7 +629,7 @@ def build_overview(): panels.append( timeseries_panel( - 19, + 20, "Root filesystem usage", root_usage_expr(), {"h": 8, "w": 12, "x": 0, "y": 41}, @@ -621,12 +644,12 @@ def build_overview(): ) panels.append( { - "id": 20, + "id": 21, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": PROM_DS, "gridPos": {"h": 8, "w": 12, "x": 12, "y": 41}, - "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A"}], + "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}], "fieldConfig": { "defaults": { "unit": "percent", diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 55c1909..0b2f69f 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -716,13 +716,13 @@ }, "gridPos": { "h": 9, - "w": 12, + "w": 8, "x": 0, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( 
topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -751,20 +751,20 @@ { "id": 12, "type": "piechart", - "title": "Namespace RAM share", + "title": "Namespace GPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 12, - "x": 12, + "w": 8, + "x": 8, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )", + "expr": "100 * ( ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -792,6 +792,48 @@ }, { "id": 13, + "type": "piechart", + 
"title": "Namespace RAM share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 10 + }, + "targets": [ + { + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 14, "type": "timeseries", "title": "Worker node CPU", "datasource": { @@ -838,7 +880,7 @@ ] }, { - "id": 14, + "id": 15, "type": "timeseries", "title": "Worker node RAM", "datasource": { @@ -885,7 +927,7 @@ ] }, { - "id": 15, + "id": 16, "type": "timeseries", "title": "Control plane CPU", "datasource": { @@ -922,7 +964,7 @@ } }, { - "id": 16, + "id": 17, "type": "timeseries", "title": "Control plane RAM", "datasource": { @@ -959,7 +1001,7 @@ } }, { - "id": 17, + "id": 18, "type": "timeseries", "title": "Cluster ingress throughput", "datasource": { @@ -975,50 +1017,8 @@ "targets": [ { "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "Bps" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": 
"list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-network dashboard", - "url": "/d/atlas-network", - "targetBlank": true - } - ] - }, - { - "id": 18, - "type": "timeseries", - "title": "Cluster egress throughput", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 34 - }, - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", - "refId": "A" + "refId": "A", + "legendFormat": "Ingress" } ], "fieldConfig": { @@ -1047,6 +1047,50 @@ { "id": 19, "type": "timeseries", + "title": "Cluster egress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 34 + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A", + "legendFormat": "Egress" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 20, + "type": "timeseries", "title": "Root filesystem usage", "datasource": { "type": "prometheus", @@ -1093,7 +1137,7 @@ ] }, { - "id": 20, + "id": 21, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -1109,7 +1153,8 @@ "targets": [ { "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A" 
+ "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index deeeacc..0ac79db 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -725,13 +725,13 @@ data: }, "gridPos": { "h": 9, - "w": 12, + "w": 8, "x": 0, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) )", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -760,20 +760,20 @@ data: { "id": 12, "type": "piechart", - "title": "Namespace RAM share", + "title": "Namespace GPU share", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 12, - "x": 12, + "w": 8, + "x": 8, "y": 10 }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by 
(namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)) ) ) / sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) )", + "expr": "100 * ( ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -801,6 +801,48 @@ data: }, { "id": 13, + "type": "piechart", + "title": "Namespace RAM share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 10 + }, + "targets": [ + { + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": 
"percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 14, "type": "timeseries", "title": "Worker node CPU", "datasource": { @@ -847,7 +889,7 @@ data: ] }, { - "id": 14, + "id": 15, "type": "timeseries", "title": "Worker node RAM", "datasource": { @@ -894,7 +936,7 @@ data: ] }, { - "id": 15, + "id": 16, "type": "timeseries", "title": "Control plane CPU", "datasource": { @@ -931,7 +973,7 @@ data: } }, { - "id": 16, + "id": 17, "type": "timeseries", "title": "Control plane RAM", "datasource": { @@ -968,7 +1010,7 @@ data: } }, { - "id": 17, + "id": 18, "type": "timeseries", "title": "Cluster ingress throughput", "datasource": { @@ -984,50 +1026,8 @@ data: "targets": [ { "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "Bps" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-network dashboard", - "url": "/d/atlas-network", - "targetBlank": true - } - ] - }, - { - "id": 18, - "type": "timeseries", - "title": "Cluster egress throughput", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 34 - }, - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", - "refId": "A" + "refId": "A", + "legendFormat": "Ingress" } ], "fieldConfig": { @@ -1056,6 +1056,50 @@ data: { "id": 19, "type": "timeseries", + "title": "Cluster egress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 34 + }, + 
"targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "refId": "A", + "legendFormat": "Egress" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 20, + "type": "timeseries", "title": "Root filesystem usage", "datasource": { "type": "prometheus", @@ -1102,7 +1146,7 @@ data: ] }, { - "id": 20, + "id": 21, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -1118,7 +1162,8 @@ data: "targets": [ { "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A" + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { From ac62387e07956a0d31f66f3d7c7c34b9fdc908d7 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 00:19:45 -0300 Subject: [PATCH 38/71] monitoring: stabilize namespace pies and labels --- scripts/render_dashboards.py | 19 ++++++++++++++----- .../monitoring/dashboards/atlas-overview.json | 9 ++++----- .../grafana-dashboard-overview.yaml | 9 ++++----- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index c194771..d6436ce 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -178,7 +178,10 @@ def namespace_ram_share_expr(): def namespace_gpu_share_expr(): - selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" + selected 
= ( + f"(( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} ))" + f" or on(namespace) ( {NAMESPACE_COMBINED_FILTER} * 0)" + ) total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" return f"100 * ( {selected} ) / {total}" @@ -225,14 +228,21 @@ NAMESPACE_RAM_RAW = ( NAMESPACE_GPU_RAW = ( 'sum(kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}) by (namespace)' ) +NAMESPACE_GPU_WEIGHT = ( + "(( " + + NAMESPACE_GPU_RAW + + " ) or on(namespace) ( " + + NAMESPACE_CPU_RAW + + " * 0))" +) NAMESPACE_COMBINED_FILTER = ( 'topk(10, (' + NAMESPACE_CPU_RAW + ") + (" + NAMESPACE_RAM_RAW - + ' / 1e9) + ( ' - + NAMESPACE_GPU_RAW - + ' * 10))' + + ' / 1e9) + (' + + NAMESPACE_GPU_WEIGHT + + " * 10))" ) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" NET_INGRESS_EXPR = ( @@ -664,7 +674,6 @@ def build_overview(): {"color": "red", "value": 85}, ], }, - "displayName": "{{node}}", }, "overrides": [], }, diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 0b2f69f..7d808c9 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -722,7 +722,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -764,7 +764,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", + "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) )) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -806,7 +806,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1182,8 +1182,7 @@ "value": 85 } ] - }, - "displayName": "{{node}}" + } }, "overrides": [] }, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 0ac79db..02b65f3 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -731,7 +731,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -773,7 +773,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", + "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) )) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0) ) / clamp_min(sum( 
sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -815,7 +815,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1191,8 +1191,7 @@ data: "value": 85 } ] - }, - "displayName": "{{node}}" + } }, "overrides": [] }, From 5144bbe1f290fc9a2a98d50d8f3f3008894df1bb Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 00:31:51 -0300 Subject: [PATCH 39/71] monitoring: fix gpu pie data and network panels --- scripts/render_dashboards.py | 18 +++++------------- 
.../monitoring/dashboards/atlas-network.json | 4 ++-- .../monitoring/dashboards/atlas-overview.json | 10 +++++----- .../monitoring/grafana-dashboard-network.yaml | 4 ++-- .../monitoring/grafana-dashboard-overview.yaml | 10 +++++----- 5 files changed, 19 insertions(+), 27 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index d6436ce..fbed073 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -178,10 +178,7 @@ def namespace_ram_share_expr(): def namespace_gpu_share_expr(): - selected = ( - f"(( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} ))" - f" or on(namespace) ( {NAMESPACE_COMBINED_FILTER} * 0)" - ) + selected = f"(( {NAMESPACE_GPU_RAW} ) or on(namespace) ( {NAMESPACE_COMBINED_FILTER} * 0)) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" return f"100 * ( {selected} ) / {total}" @@ -226,7 +223,8 @@ NAMESPACE_RAM_RAW = ( 'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)' ) NAMESPACE_GPU_RAW = ( - 'sum(kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}) by (namespace)' + 'sum(kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"} ' + 'or kube_pod_resource_request{namespace!="",resource="nvidia.com/gpu"}) by (namespace)' ) NAMESPACE_GPU_WEIGHT = ( "(( " @@ -245,14 +243,8 @@ NAMESPACE_COMBINED_FILTER = ( + " * 10))" ) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" -NET_INGRESS_EXPR = ( - 'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) ' - "or on() vector(0)" -) -NET_EGRESS_EXPR = ( - 'sum(rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m])) ' - "or on() vector(0)" -) +NET_INGRESS_EXPR = 'sum(rate(node_network_receive_bytes_total{device!~"lo"}[5m])) or on() vector(0)' +NET_EGRESS_EXPR = 'sum(rate(node_network_transmit_bytes_total{device!~"lo"}[5m])) 
or on() vector(0)' # --------------------------------------------------------------------------- # Panel factories diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index 0363b81..27da627 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -80,7 +80,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 7d808c9..eb3f11d 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -722,7 +722,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -764,7 +764,7 @@ }, "targets": [ { - "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) )) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by 
(namespace) * 0)) * 10)) * 0) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", + "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0)) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -806,7 +806,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, 
(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1016,7 +1016,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Ingress" } @@ -1060,7 +1060,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", 
"refId": "A", "legendFormat": "Egress" } diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index 2d7d989..1b70159 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -89,7 +89,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 02b65f3..e7ddd48 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -731,7 +731,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -773,7 +773,7 @@ data: }, "targets": [ { - "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) )) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by 
(namespace) * 0)) * 10)) * 0) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", + "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0)) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -815,7 +815,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, 
(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1025,7 +1025,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Ingress" } @@ -1069,7 +1069,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() 
vector(0)", "refId": "A", "legendFormat": "Egress" } From ec76563a8677cbf5926dcafb13014af8fac028e5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 01:01:10 -0300 Subject: [PATCH 40/71] monitoring: source gpu pie from limits and node nets --- scripts/render_dashboards.py | 7 ++++--- services/monitoring/dashboards/atlas-overview.json | 6 +++--- services/monitoring/grafana-dashboard-overview.yaml | 6 +++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index fbed073..0916969 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -178,7 +178,7 @@ def namespace_ram_share_expr(): def namespace_gpu_share_expr(): - selected = f"(( {NAMESPACE_GPU_RAW} ) or on(namespace) ( {NAMESPACE_COMBINED_FILTER} * 0)) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" + selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" return f"100 * ( {selected} ) / {total}" @@ -223,8 +223,9 @@ NAMESPACE_RAM_RAW = ( 'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)' ) NAMESPACE_GPU_RAW = ( - 'sum(kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"} ' - 'or kube_pod_resource_request{namespace!="",resource="nvidia.com/gpu"}) by (namespace)' + 'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}' + ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})' + ') by (namespace)' ) NAMESPACE_GPU_WEIGHT = ( "(( " diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index eb3f11d..f6d42c1 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -722,7 +722,7 @@ }, "targets": [ { - "expr": "100 * ( ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -764,7 +764,7 @@ }, "targets": [ { - "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( topk(10, 
(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0)) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -806,7 +806,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( 
sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index e7ddd48..bf7b710 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -731,7 +731,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 
0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -773,7 +773,7 @@ data: }, "targets": [ { - "expr": "100 * ( (( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) * 0)) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) 
and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -815,7 +815,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum(kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_resource_request{namespace!=\"\",resource=\"nvidia.com/gpu\"}) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } From d7e4bcd53315d5231c0760635479173e76d5c526 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 10:47:24 -0300 Subject: [PATCH 41/71] monitoring: add gpu node fallback --- scripts/render_dashboards.py | 24 ++++++++++++------- .../monitoring/dashboards/atlas-overview.json | 6 ++--- .../grafana-dashboard-overview.yaml | 6 ++--- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 0916969..44a0de1 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -222,18 +222,24 @@ NAMESPACE_CPU_RAW = ( NAMESPACE_RAM_RAW = ( 'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)' ) -NAMESPACE_GPU_RAW = ( +GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] +GPU_NODE_REGEX = "|".join(GPU_NODES) +NAMESPACE_GPU_REQUEST = ( 'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}' - ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})' - ') by (namespace)' + ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)' ) -NAMESPACE_GPU_WEIGHT = ( - "(( " - + NAMESPACE_GPU_RAW - + " ) or on(namespace) ( " - + NAMESPACE_CPU_RAW - + " * 0))" +NAMESPACE_GPU_FALLBACK = ( + 'sum by (namespace) (kube_pod_info{namespace!=""}' + f' and 
on(node) kube_node_info{{node=~"{GPU_NODE_REGEX}"}})' ) +NAMESPACE_GPU_RAW = ( + "(" + + NAMESPACE_GPU_REQUEST + + ") or on(namespace) group_left() (" + + NAMESPACE_GPU_FALLBACK + + ")" +) +NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_RAW NAMESPACE_COMBINED_FILTER = ( 'topk(10, (' + NAMESPACE_CPU_RAW diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index f6d42c1..11634d9 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -722,7 +722,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by 
(namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -764,7 +764,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ), 1)", + "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -806,7 +806,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) 
by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index bf7b710..f243cf8 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -731,7 +731,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, 
(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -773,7 +773,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ), 1)", + "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or 
kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -815,7 +815,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((( sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) ) or on(namespace) ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0)) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } From 7009a4f9ff25b908f38c9046a8a15929b436d5ae Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 11:12:03 -0300 Subject: [PATCH 42/71] monitoring: fix namespace gpu share and network stats --- scripts/render_dashboards.py | 14 ++++++++------ .../monitoring/dashboards/atlas-network.json | 4 ++-- .../monitoring/dashboards/atlas-overview.json | 16 ++++++++-------- .../monitoring/grafana-dashboard-network.yaml | 4 ++-- .../monitoring/grafana-dashboard-overview.yaml | 16 ++++++++-------- 5 files changed, 28 insertions(+), 26 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 44a0de1..b53c8c7 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -178,7 +178,9 @@ def namespace_ram_share_expr(): def namespace_gpu_share_expr(): - selected = f"( {NAMESPACE_GPU_RAW} ) and 
on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" + selected = ( + f"( {NAMESPACE_GPU_RAW} ) + on(namespace) group_left() (0 * {NAMESPACE_COMBINED_FILTER})" + ) total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" return f"100 * ( {selected} ) / {total}" @@ -250,8 +252,8 @@ NAMESPACE_COMBINED_FILTER = ( + " * 10))" ) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" -NET_INGRESS_EXPR = 'sum(rate(node_network_receive_bytes_total{device!~"lo"}[5m])) or on() vector(0)' -NET_EGRESS_EXPR = 'sum(rate(node_network_transmit_bytes_total{device!~"lo"}[5m])) or on() vector(0)' +NET_INGRESS_EXPR = 'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))' +NET_EGRESS_EXPR = 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))' # --------------------------------------------------------------------------- # Panel factories @@ -471,7 +473,7 @@ def build_overview(): thresholds = { "mode": "absolute", "steps": [ - {"color": "red", "value": 0}, + {"color": "red", "value": None}, {"color": "orange", "value": WORKER_TOTAL - 2}, {"color": "yellow", "value": WORKER_TOTAL - 1}, {"color": "green", "value": WORKER_TOTAL}, @@ -481,7 +483,7 @@ def build_overview(): thresholds = { "mode": "absolute", "steps": [ - {"color": "red", "value": 0}, + {"color": "red", "value": None}, {"color": "green", "value": CONTROL_TOTAL}, ], } @@ -489,7 +491,7 @@ def build_overview(): thresholds = { "mode": "absolute", "steps": [ - {"color": "green", "value": 0}, + {"color": "green", "value": None}, {"color": "yellow", "value": 1}, {"color": "orange", "value": 2}, {"color": "red", "value": 3}, diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index 27da627..a4daa0c 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": 
"sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A" } ], @@ -80,7 +80,7 @@ }, "targets": [ { - "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 11634d9..16e01b3 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -38,7 +38,7 @@ "steps": [ { "color": "red", - "value": 0 + "value": null }, { "color": "orange", @@ -107,7 +107,7 @@ "steps": [ { "color": "red", - "value": 0 + "value": null }, { "color": "green", @@ -168,7 +168,7 @@ "steps": [ { "color": "green", - "value": 0 + "value": null }, { "color": "yellow", @@ -243,7 +243,7 @@ "steps": [ { "color": "green", - "value": 0 + "value": null }, { "color": "yellow", @@ -318,7 +318,7 @@ "steps": [ { "color": "green", - "value": 0 + "value": null }, { "color": "yellow", @@ -764,7 +764,7 @@ }, "targets": [ { - "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or 
kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", + "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) 
), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1016,7 +1016,7 @@ }, "targets": [ { - "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A", "legendFormat": "Ingress" } @@ -1060,7 +1060,7 @@ }, "targets": [ { - "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A", "legendFormat": "Egress" } diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index 1b70159..029987a 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A" } ], @@ -89,7 +89,7 @@ data: }, "targets": [ { - "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index f243cf8..55afe64 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -47,7 +47,7 @@ data: "steps": [ { "color": "red", - "value": 0 + "value": null }, { "color": "orange", @@ -116,7 +116,7 @@ data: "steps": [ { "color": "red", - "value": 0 + "value": null }, { "color": "green", @@ -177,7 +177,7 @@ data: "steps": [ { "color": "green", - "value": 0 + "value": null }, { "color": 
"yellow", @@ -252,7 +252,7 @@ data: "steps": [ { "color": "green", - "value": 0 + "value": null }, { "color": "yellow", @@ -327,7 +327,7 @@ data: "steps": [ { "color": "green", - "value": 0 + "value": null }, { "color": "yellow", @@ -773,7 +773,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", + "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + 
on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1025,7 +1025,7 @@ data: }, "targets": [ { - "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A", "legendFormat": "Ingress" } @@ -1069,7 +1069,7 @@ data: }, "targets": [ { - "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A", "legendFormat": "Egress" } From fab5552039b4ee3c4d72e76203c7fe40bac6cee0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 11:30:33 -0300 Subject: [PATCH 43/71] monitoring: resolve pie errors and network data --- scripts/render_dashboards.py | 21 ++++++++++++------- .../monitoring/dashboards/atlas-network.json | 4 ++-- .../monitoring/dashboards/atlas-overview.json | 10 ++++----- 
.../monitoring/grafana-dashboard-network.yaml | 4 ++-- .../grafana-dashboard-overview.yaml | 10 ++++----- 5 files changed, 27 insertions(+), 22 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index b53c8c7..8e9bc8a 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -178,9 +178,7 @@ def namespace_ram_share_expr(): def namespace_gpu_share_expr(): - selected = ( - f"( {NAMESPACE_GPU_RAW} ) + on(namespace) group_left() (0 * {NAMESPACE_COMBINED_FILTER})" - ) + selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" return f"100 * ( {selected} ) / {total}" @@ -231,13 +229,14 @@ NAMESPACE_GPU_REQUEST = ( ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)' ) NAMESPACE_GPU_FALLBACK = ( - 'sum by (namespace) (kube_pod_info{namespace!=""}' - f' and on(node) kube_node_info{{node=~"{GPU_NODE_REGEX}"}})' + 'sum by (namespace) (kube_pod_info{namespace!="",node=~"' + + GPU_NODE_REGEX + + '"})' ) NAMESPACE_GPU_RAW = ( "(" + NAMESPACE_GPU_REQUEST - + ") or on(namespace) group_left() (" + + ") or on(namespace) (" + NAMESPACE_GPU_FALLBACK + ")" ) @@ -252,8 +251,14 @@ NAMESPACE_COMBINED_FILTER = ( + " * 10))" ) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" -NET_INGRESS_EXPR = 'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))' -NET_EGRESS_EXPR = 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))' +NET_INGRESS_EXPR = ( + 'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))' + " or on() vector(0)" +) +NET_EGRESS_EXPR = ( + 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))' + " or on() vector(0)" +) # --------------------------------------------------------------------------- # Panel factories diff 
--git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index a4daa0c..098e1db 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -80,7 +80,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 16e01b3..5772c2c 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -722,7 +722,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 
1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -764,7 +764,7 @@ }, "targets": [ { - "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( 
(sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", + "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -806,7 +806,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1016,7 +1016,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Ingress" } @@ -1060,7 +1060,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", + "expr": 
"sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Egress" } diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index 029987a..a552793 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -89,7 +89,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 55afe64..00755a9 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -731,7 +731,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) 
kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -773,7 +773,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) 
(kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", + "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -815,7 +815,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, 
(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1025,7 +1025,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Ingress" } @@ -1069,7 +1069,7 @@ data: }, "targets": [ { - "expr": 
"sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Egress" } From 497164a1ad4b49632414f83e54c89cac97049543 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 11:42:24 -0300 Subject: [PATCH 44/71] monitoring: clean namespace gpu share and layout --- scripts/render_dashboards.py | 29 ++++--------------- .../monitoring/dashboards/atlas-overview.json | 6 ++-- .../grafana-dashboard-overview.yaml | 6 ++-- 3 files changed, 12 insertions(+), 29 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 8e9bc8a..bce5bfe 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -224,21 +224,19 @@ NAMESPACE_RAM_RAW = ( ) GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) -NAMESPACE_GPU_REQUEST = ( +NAMESPACE_GPU_ALLOC = ( 'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}' ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)' ) -NAMESPACE_GPU_FALLBACK = ( - 'sum by (namespace) (kube_pod_info{namespace!="",node=~"' - + GPU_NODE_REGEX - + '"})' +NAMESPACE_GPU_USAGE = ( + 'sum(rate(container_accelerator_duty_cycle{namespace!="",accelerator="nvidia.com/gpu"}[5m])) by (namespace)' ) NAMESPACE_GPU_RAW = ( "(" - + NAMESPACE_GPU_REQUEST + + NAMESPACE_GPU_USAGE + ") or on(namespace) (" - + NAMESPACE_GPU_FALLBACK - + ")" + + NAMESPACE_GPU_ALLOC + + " * 0)" ) NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_RAW NAMESPACE_COMBINED_FILTER = ( @@ -711,21 +709,6 @@ def build_overview(): ) ) - panels.append( - text_panel( - 25, - "About this dashboard", - textwrap.dedent( - """\ -### Atlas Overview -- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs. 
-- Control plane workload count flags any non-system pods that slipped onto the HA nodes. -- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly.""" - ), - {"h": 5, "w": 24, "x": 0, "y": 55}, - ) - ) - return { "uid": "atlas-overview", "title": "Atlas Overview", diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 5772c2c..5953697 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -722,7 +722,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) 
(sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -764,7 +764,7 @@ }, "targets": [ { - "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", + "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and 
on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -806,7 +806,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, 
(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 00755a9..f8b40af 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -731,7 +731,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and 
on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -773,7 +773,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum by (namespace) 
(kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", + "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -815,7 +815,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) 
by (namespace)) or on(namespace) (sum by (namespace) (kube_pod_info{namespace!=\"\",node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } From 8e6c0a3cfefd5ce7e30030f42ba47b64eb74c6d8 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 12:11:47 -0300 Subject: [PATCH 45/71] monitoring: rework gpu share + gauges --- scripts/render_dashboards.py | 63 +++++++- .../monitoring/dashboards/atlas-overview.json | 144 +++++------------- .../grafana-dashboard-overview.yaml | 144 +++++------------- 3 files changed, 144 insertions(+), 207 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index bce5bfe..937dfb7 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -238,7 +238,7 @@ NAMESPACE_GPU_RAW = ( + NAMESPACE_GPU_ALLOC + " * 0)" ) -NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_RAW +NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_ALLOC NAMESPACE_COMBINED_FILTER = ( 'topk(10, (' + NAMESPACE_CPU_RAW @@ -319,6 +319,49 @@ def stat_panel( 
return panel +def gauge_panel( + panel_id, + title, + expr, + grid, + *, + min_value=0, + max_value=1, + thresholds=None, + links=None, +): + return { + "id": panel_id, + "type": "gauge", + "title": title, + "datasource": PROM_DS, + "gridPos": grid, + "targets": [{"expr": expr, "refId": "A"}], + "fieldConfig": { + "defaults": { + "min": min_value, + "max": max_value, + "thresholds": thresholds + or { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": max_value}, + ], + }, + }, + "overrides": [], + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, + "orientation": "auto", + "showThresholdMarkers": False, + "showThresholdLabels": False, + }, + **({"links": links} if links else {}), + } + + def timeseries_panel( panel_id, title, @@ -472,7 +515,10 @@ def build_overview(): ] for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats): thresholds = None + min_value = 0 + max_value = ok_value or 5 if panel_id == 1: + max_value = WORKER_TOTAL thresholds = { "mode": "absolute", "steps": [ @@ -483,6 +529,7 @@ def build_overview(): ], } elif panel_id == 2: + max_value = CONTROL_TOTAL thresholds = { "mode": "absolute", "steps": [ @@ -491,6 +538,7 @@ def build_overview(): ], } elif panel_id in (3, 4, 5): + max_value = 4 thresholds = { "mode": "absolute", "steps": [ @@ -500,13 +548,22 @@ def build_overview(): {"color": "red", "value": 3}, ], } + else: + thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "red", "value": max_value}, + ], + } panels.append( - stat_panel( + gauge_panel( panel_id, title, expr, {"h": 5, "w": 4, "x": 4 * idx, "y": 0}, - value_suffix=suffix, + min_value=min_value, + max_value=max_value, thresholds=thresholds, links=links, ) diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 5953697..ad460bb 100644 --- 
a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -9,7 +9,7 @@ "panels": [ { "id": 1, - "type": "stat", + "type": "gauge", "title": "Workers ready", "datasource": { "type": "prometheus", @@ -29,10 +29,8 @@ ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 18, "thresholds": { "mode": "absolute", "steps": [ @@ -53,19 +51,11 @@ "value": 18 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto", - "valueSuffix": "/18" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -73,12 +63,14 @@ "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false } }, { "id": 2, - "type": "stat", + "type": "gauge", "title": "Control plane ready", "datasource": { "type": "prometheus", @@ -98,10 +90,8 @@ ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 3, "thresholds": { "mode": "absolute", "steps": [ @@ -114,19 +104,11 @@ "value": 3 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto", - "valueSuffix": "/3" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -134,12 +116,14 @@ "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false } }, { "id": 3, - "type": "stat", + "type": "gauge", "title": "Control plane workloads", "datasource": { "type": "prometheus", @@ -159,10 +143,8 @@ ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 4, "thresholds": { "mode": "absolute", "steps": [ @@ -183,18 +165,11 @@ "value": 3 } ] - }, - 
"unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -202,7 +177,9 @@ "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false }, "links": [ { @@ -214,7 +191,7 @@ }, { "id": 4, - "type": "stat", + "type": "gauge", "title": "Problem pods", "datasource": { "type": "prometheus", @@ -234,10 +211,8 @@ ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 4, "thresholds": { "mode": "absolute", "steps": [ @@ -258,18 +233,11 @@ "value": 3 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -277,7 +245,9 @@ "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false }, "links": [ { @@ -289,7 +259,7 @@ }, { "id": 5, - "type": "stat", + "type": "gauge", "title": "Stuck terminating", "datasource": { "type": "prometheus", @@ -309,10 +279,8 @@ ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 4, "thresholds": { "mode": "absolute", "steps": [ @@ -333,18 +301,11 @@ "value": 3 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -352,7 +313,9 @@ "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false }, "links": [ { @@ -364,7 +327,7 @@ }, { "id": 6, - "type": "stat", + "type": "gauge", "title": "Running 
pods", "datasource": { "type": "prometheus", @@ -384,34 +347,25 @@ ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 5, "thresholds": { "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", - "value": 1 + "color": "red", + "value": 5 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -419,7 +373,9 @@ "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false } }, { @@ -722,7 +678,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -764,7 +720,7 @@ }, "targets": [ { - "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", + "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) 
(sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -806,7 +762,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( 
sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1480,22 +1436,6 @@ "targetBlank": true } ] - }, - { - "id": 25, - "type": "text", - "title": "About this dashboard", - "gridPos": { - "h": 5, - "w": 24, - "x": 0, - "y": 55 - }, - "datasource": null, - "options": { - "mode": "markdown", - "content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly." 
- } } ], "schemaVersion": 39, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index f8b40af..6503da9 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -18,7 +18,7 @@ data: "panels": [ { "id": 1, - "type": "stat", + "type": "gauge", "title": "Workers ready", "datasource": { "type": "prometheus", @@ -38,10 +38,8 @@ data: ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 18, "thresholds": { "mode": "absolute", "steps": [ @@ -62,19 +60,11 @@ data: "value": 18 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto", - "valueSuffix": "/18" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -82,12 +72,14 @@ data: "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false } }, { "id": 2, - "type": "stat", + "type": "gauge", "title": "Control plane ready", "datasource": { "type": "prometheus", @@ -107,10 +99,8 @@ data: ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 3, "thresholds": { "mode": "absolute", "steps": [ @@ -123,19 +113,11 @@ data: "value": 3 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto", - "valueSuffix": "/3" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -143,12 +125,14 @@ data: "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false } }, { "id": 3, - "type": "stat", + "type": "gauge", "title": "Control plane workloads", "datasource": { "type": "prometheus", @@ -168,10 
+152,8 @@ data: ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 4, "thresholds": { "mode": "absolute", "steps": [ @@ -192,18 +174,11 @@ data: "value": 3 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -211,7 +186,9 @@ data: "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false }, "links": [ { @@ -223,7 +200,7 @@ data: }, { "id": 4, - "type": "stat", + "type": "gauge", "title": "Problem pods", "datasource": { "type": "prometheus", @@ -243,10 +220,8 @@ data: ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 4, "thresholds": { "mode": "absolute", "steps": [ @@ -267,18 +242,11 @@ data: "value": 3 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -286,7 +254,9 @@ data: "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false }, "links": [ { @@ -298,7 +268,7 @@ data: }, { "id": 5, - "type": "stat", + "type": "gauge", "title": "Stuck terminating", "datasource": { "type": "prometheus", @@ -318,10 +288,8 @@ data: ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 4, "thresholds": { "mode": "absolute", "steps": [ @@ -342,18 +310,11 @@ data: "value": 3 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ 
"lastNotNull" @@ -361,7 +322,9 @@ data: "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false }, "links": [ { @@ -373,7 +336,7 @@ data: }, { "id": 6, - "type": "stat", + "type": "gauge", "title": "Running pods", "datasource": { "type": "prometheus", @@ -393,34 +356,25 @@ data: ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 5, "thresholds": { "mode": "absolute", "steps": [ { - "color": "rgba(115, 115, 115, 1)", + "color": "green", "value": null }, { - "color": "green", - "value": 1 + "color": "red", + "value": 5 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -428,7 +382,9 @@ data: "fields": "", "values": false }, - "textMode": "value" + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false } }, { @@ -731,7 +687,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -773,7 +729,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or 
kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", + "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -815,7 +771,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or 
kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1489,22 +1445,6 @@ data: "targetBlank": true } ] - }, - { - "id": 25, - "type": "text", - "title": "About this dashboard", - "gridPos": { - "h": 5, - "w": 24, - "x": 0, - "y": 55 - }, - "datasource": null, - "options": { - "mode": "markdown", - "content": "### Atlas Overview\n- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.\n- Control plane workload count flags any non-system pods that slipped onto the HA nodes.\n- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly." 
- } } ], "schemaVersion": 39, From ff056551c7d6fb7cc64dfed73680f8b7a13421e5 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 14:08:33 -0300 Subject: [PATCH 46/71] monitoring: refresh overview dashboards --- scripts/render_dashboards.py | 182 +-- .../monitoring/dashboards/atlas-network.json | 86 +- .../monitoring/dashboards/atlas-overview.json | 1150 +++++++++-------- services/monitoring/dcgm-exporter.yaml | 74 ++ .../monitoring/grafana-dashboard-network.yaml | 86 +- .../grafana-dashboard-overview.yaml | 1150 +++++++++-------- services/monitoring/kustomization.yaml | 1 + 7 files changed, 1511 insertions(+), 1218 deletions(-) create mode 100644 services/monitoring/dcgm-exporter.yaml diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 937dfb7..273090a 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -165,22 +165,22 @@ def node_io_expr(scope=""): return scoped_node_expr(base, scope) -def namespace_cpu_share_expr(): - selected = f"( {NAMESPACE_CPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - total = f"clamp_min(sum( {NAMESPACE_CPU_RAW} ), 1)" +def namespace_share_expr(resource_expr): + selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )" + total = f"clamp_min(sum( {resource_expr} ), 1)" return f"100 * ( {selected} ) / {total}" +def namespace_cpu_share_expr(): + return namespace_share_expr(NAMESPACE_CPU_RAW) + + def namespace_ram_share_expr(): - selected = f"( {NAMESPACE_RAM_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - total = f"clamp_min(sum( {NAMESPACE_RAM_RAW} ), 1)" - return f"100 * ( {selected} ) / {total}" + return namespace_share_expr(NAMESPACE_RAM_RAW) def namespace_gpu_share_expr(): - selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" - total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" - return f"100 * ( {selected} ) / {total}" + return namespace_share_expr(NAMESPACE_GPU_RAW) PROBLEM_PODS_EXPR = 
'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))' @@ -228,35 +228,47 @@ NAMESPACE_GPU_ALLOC = ( 'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}' ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)' ) -NAMESPACE_GPU_USAGE = ( - 'sum(rate(container_accelerator_duty_cycle{namespace!="",accelerator="nvidia.com/gpu"}[5m])) by (namespace)' -) +NAMESPACE_GPU_USAGE = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)' NAMESPACE_GPU_RAW = ( "(" + NAMESPACE_GPU_USAGE + ") or on(namespace) (" - + NAMESPACE_GPU_ALLOC + + NAMESPACE_CPU_RAW + " * 0)" ) -NAMESPACE_GPU_WEIGHT = NAMESPACE_GPU_ALLOC -NAMESPACE_COMBINED_FILTER = ( - 'topk(10, (' +NAMESPACE_GPU_WEIGHT = ( + "(" + + NAMESPACE_GPU_ALLOC + + ") or on(namespace) (" + NAMESPACE_CPU_RAW - + ") + (" - + NAMESPACE_RAM_RAW - + ' / 1e9) + (' - + NAMESPACE_GPU_WEIGHT - + " * 10))" + + " * 0)" ) +NAMESPACE_ACTIVITY_SCORE = ( + "( " + + NAMESPACE_CPU_RAW + + " ) + (" + + NAMESPACE_RAM_RAW + + " / 1e9) + (" + + NAMESPACE_GPU_WEIGHT + + " * 100)" +) +NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)" TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" -NET_INGRESS_EXPR = ( - 'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))' +TRAEFIK_NET_INGRESS = ( + 'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))' " or on() vector(0)" ) -NET_EGRESS_EXPR = ( +TRAEFIK_NET_EGRESS = ( + 'sum(rate(container_network_transmit_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))' + " or on() vector(0)" +) +NET_TOTAL_EXPR = ( 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))' " or on() vector(0)" ) +NET_INGRESS_EXPR = TRAEFIK_NET_INGRESS +NET_EGRESS_EXPR = TRAEFIK_NET_EGRESS +NET_INTERNAL_EXPR = f"clamp_min(({NET_TOTAL_EXPR}) - 
({TRAEFIK_NET_EGRESS}), 0)" # --------------------------------------------------------------------------- # Panel factories @@ -438,10 +450,20 @@ def pie_panel(panel_id, title, expr, grid): "datasource": PROM_DS, "gridPos": grid, "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}], - "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []}, + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": {"mode": "palette-classic"}, + }, + "overrides": [], + }, "options": { "legend": {"displayMode": "list", "placement": "right"}, "pieType": "pie", + "displayLabels": ["percent"], + "tooltip": {"mode": "single"}, + "colorScheme": "interpolateSpectral", + "colorBy": "value", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, }, } @@ -511,7 +533,6 @@ def build_overview(): 1, link_to("atlas-pods"), ), - (6, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None), ] for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats): thresholds = None @@ -591,12 +612,31 @@ def build_overview(): ) ) + storage_panels = [ + (23, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), + (24, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), + (25, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), + (26, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), + ] + for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): + panels.append( + stat_panel( + panel_id, + title, + expr, + {"h": 6, "w": 6, "x": 6 * idx, "y": 10}, + unit=unit, + thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, + links=link_to("atlas-storage"), + ) + ) + panels.append( pie_panel( 11, "Namespace CPU share", namespace_cpu_share_expr(), - {"h": 9, "w": 8, "x": 0, "y": 10}, + {"h": 9, "w": 8, "x": 0, "y": 16}, ) ) panels.append( @@ -604,7 +644,7 @@ def build_overview(): 12, "Namespace GPU share", namespace_gpu_share_expr(), - {"h": 9, "w": 
8, "x": 8, "y": 10}, + {"h": 9, "w": 8, "x": 8, "y": 16}, ) ) panels.append( @@ -612,7 +652,7 @@ def build_overview(): 13, "Namespace RAM share", namespace_ram_share_expr(), - {"h": 9, "w": 8, "x": 16, "y": 10}, + {"h": 9, "w": 8, "x": 16, "y": 16}, ) ) @@ -622,7 +662,7 @@ def build_overview(): 14, "Worker node CPU", node_cpu_expr(worker_filter), - {"h": 8, "w": 12, "x": 0, "y": 19}, + {"h": 8, "w": 12, "x": 0, "y": 25}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -636,7 +676,7 @@ def build_overview(): 15, "Worker node RAM", node_mem_expr(worker_filter), - {"h": 8, "w": 12, "x": 12, "y": 19}, + {"h": 8, "w": 12, "x": 12, "y": 25}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -651,7 +691,7 @@ def build_overview(): 16, "Control plane CPU", node_cpu_expr(CONTROL_REGEX), - {"h": 7, "w": 12, "x": 0, "y": 27}, + {"h": 7, "w": 12, "x": 0, "y": 33}, unit="percent", legend="{{node}}", legend_display="table", @@ -663,7 +703,7 @@ def build_overview(): 17, "Control plane RAM", node_mem_expr(CONTROL_REGEX), - {"h": 7, "w": 12, "x": 12, "y": 27}, + {"h": 7, "w": 12, "x": 12, "y": 33}, unit="percent", legend="{{node}}", legend_display="table", @@ -676,9 +716,9 @@ def build_overview(): 18, "Cluster ingress throughput", NET_INGRESS_EXPR, - {"h": 7, "w": 12, "x": 0, "y": 34}, + {"h": 7, "w": 8, "x": 0, "y": 40}, unit="Bps", - legend="Ingress", + legend="Ingress (Traefik)", legend_display="list", legend_placement="bottom", links=link_to("atlas-network"), @@ -689,9 +729,22 @@ def build_overview(): 19, "Cluster egress throughput", NET_EGRESS_EXPR, - {"h": 7, "w": 12, "x": 12, "y": 34}, + {"h": 7, "w": 8, "x": 8, "y": 40}, unit="Bps", - legend="Egress", + legend="Egress (Traefik)", + legend_display="list", + legend_placement="bottom", + links=link_to("atlas-network"), + ) + ) + panels.append( + timeseries_panel( + 20, + "Intra-cluster throughput", + NET_INTERNAL_EXPR, + {"h": 7, "w": 8, "x": 16, "y": 40}, + unit="Bps", + legend="Internal traffic", 
legend_display="list", legend_placement="bottom", links=link_to("atlas-network"), @@ -700,10 +753,10 @@ def build_overview(): panels.append( timeseries_panel( - 20, + 21, "Root filesystem usage", root_usage_expr(), - {"h": 8, "w": 12, "x": 0, "y": 41}, + {"h": 8, "w": 12, "x": 0, "y": 47}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -715,11 +768,11 @@ def build_overview(): ) panels.append( { - "id": 21, + "id": 22, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": PROM_DS, - "gridPos": {"h": 8, "w": 12, "x": 12, "y": 41}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 47}, "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}], "fieldConfig": { "defaults": { @@ -744,28 +797,10 @@ def build_overview(): "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, }, "links": link_to("atlas-storage"), + "transformations": [{"id": "labelsToFields", "options": {}}], } ) - storage_panels = [ - (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), - (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), - (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), - (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), - ] - for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): - panels.append( - stat_panel( - panel_id, - title, - expr, - {"h": 6, "w": 6, "x": 6 * idx, "y": 49}, - unit=unit, - thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, - links=link_to("atlas-storage"), - ) - ) - return { "uid": "atlas-overview", "title": "Atlas Overview", @@ -1110,12 +1145,15 @@ def build_network_dashboard(): panels.append( stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps") ) + panels.append( + stat_panel(3, "Intra-cluster traffic", NET_INTERNAL_EXPR, {"h": 4, "w": 8, "x": 16, "y": 0}, unit="Bps") + ) panels.append( stat_panel( - 3, + 4, "Top router req/s", 
f"topk(1, {TRAEFIK_ROUTER_EXPR})", - {"h": 4, "w": 8, "x": 16, "y": 0}, + {"h": 4, "w": 8, "x": 0, "y": 4}, unit="req/s", legend="{{router}}", instant=True, @@ -1123,10 +1161,10 @@ def build_network_dashboard(): ) panels.append( timeseries_panel( - 4, + 5, "Per-node throughput", node_net_expr(), - {"h": 8, "w": 24, "x": 0, "y": 4}, + {"h": 8, "w": 24, "x": 0, "y": 8}, unit="Bps", legend="{{node}}", legend_display="table", @@ -1135,32 +1173,32 @@ def build_network_dashboard(): ) panels.append( table_panel( - 5, + 6, "Top namespaces", 'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))', - {"h": 9, "w": 12, "x": 0, "y": 12}, + {"h": 9, "w": 12, "x": 0, "y": 16}, unit="Bps", transformations=[{"id": "labelsToFields", "options": {}}], ) ) panels.append( table_panel( - 6, + 7, "Top pods", 'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))', - {"h": 9, "w": 12, "x": 12, "y": 12}, + {"h": 9, "w": 12, "x": 12, "y": 16}, unit="Bps", transformations=[{"id": "labelsToFields", "options": {}}], ) ) panels.append( timeseries_panel( - 7, + 8, "Traefik routers (req/s)", f"topk(10, {TRAEFIK_ROUTER_EXPR})", - {"h": 9, "w": 12, "x": 0, "y": 21}, + {"h": 9, "w": 12, "x": 0, "y": 25}, unit="req/s", legend="{{router}}", legend_display="table", @@ -1169,10 +1207,10 @@ def build_network_dashboard(): ) panels.append( timeseries_panel( - 8, + 9, "Traefik entrypoints (req/s)", 'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))', - {"h": 9, "w": 12, "x": 12, "y": 21}, + {"h": 9, "w": 12, "x": 12, "y": 25}, unit="req/s", legend="{{entrypoint}}", legend_display="table", diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index 098e1db..1baec3a 100644 --- a/services/monitoring/dashboards/atlas-network.json 
+++ b/services/monitoring/dashboards/atlas-network.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -80,7 +80,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -127,7 +127,7 @@ { "id": 3, "type": "stat", - "title": "Top router req/s", + "title": "Intra-cluster traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -138,6 +138,66 @@ "x": 16, "y": 0 }, + "targets": [ + { + "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "Bps", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Top router req/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 4 + }, "targets": [ { "expr": "topk(1, sum by (router) 
(rate(traefik_router_requests_total[5m])))", @@ -187,7 +247,7 @@ } }, { - "id": 4, + "id": 5, "type": "timeseries", "title": "Per-node throughput", "datasource": { @@ -198,7 +258,7 @@ "h": 8, "w": 24, "x": 0, - "y": 4 + "y": 8 }, "targets": [ { @@ -224,7 +284,7 @@ } }, { - "id": 5, + "id": 6, "type": "table", "title": "Top namespaces", "datasource": { @@ -235,7 +295,7 @@ "h": 9, "w": 12, "x": 0, - "y": 12 + "y": 16 }, "targets": [ { @@ -260,7 +320,7 @@ ] }, { - "id": 6, + "id": 7, "type": "table", "title": "Top pods", "datasource": { @@ -271,7 +331,7 @@ "h": 9, "w": 12, "x": 12, - "y": 12 + "y": 16 }, "targets": [ { @@ -296,7 +356,7 @@ ] }, { - "id": 7, + "id": 8, "type": "timeseries", "title": "Traefik routers (req/s)", "datasource": { @@ -307,7 +367,7 @@ "h": 9, "w": 12, "x": 0, - "y": 21 + "y": 25 }, "targets": [ { @@ -333,7 +393,7 @@ } }, { - "id": 8, + "id": 9, "type": "timeseries", "title": "Traefik entrypoints (req/s)", "datasource": { @@ -344,7 +404,7 @@ "h": 9, "w": 12, "x": 12, - "y": 21 + "y": 25 }, "targets": [ { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index ad460bb..eba6466 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -325,59 +325,6 @@ } ] }, - { - "id": 6, - "type": "gauge", - "title": "Running pods", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 4, - "x": 20, - "y": 0 - }, - "targets": [ - { - "expr": "sum(kube_pod_status_phase{phase=\"Running\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "min": 0, - "max": 5, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "options": { - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "orientation": "auto", - 
"showThresholdMarkers": false, - "showThresholdLabels": false - } - }, { "id": 7, "type": "stat", @@ -663,506 +610,7 @@ ] }, { - "id": 11, - "type": "piechart", - "title": "Namespace CPU share", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 0, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 12, - "type": "piechart", - "title": "Namespace GPU share", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 8, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, 
(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 13, - "type": "piechart", - "title": "Namespace RAM share", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 16, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) 
by (namespace) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 14, - "type": "timeseries", - "title": "Worker node CPU", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 19 - }, - "targets": [ - { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-nodes dashboard", - "url": "/d/atlas-nodes", - "targetBlank": true - } - ] - }, - { - "id": 15, - "type": "timeseries", - "title": "Worker node RAM", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 19 - }, - "targets": [ - { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * 
on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-nodes dashboard", - "url": "/d/atlas-nodes", - "targetBlank": true - } - ] - }, - { - "id": 16, - "type": "timeseries", - "title": "Control plane CPU", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 27 - }, - "targets": [ - { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 17, - "type": "timeseries", - "title": "Control plane RAM", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 27 - }, - "targets": [ - { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", 
\"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 18, - "type": "timeseries", - "title": "Cluster ingress throughput", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 34 - }, - "targets": [ - { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", - "refId": "A", - "legendFormat": "Ingress" - } - ], - "fieldConfig": { - "defaults": { - "unit": "Bps" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-network dashboard", - "url": "/d/atlas-network", - "targetBlank": true - } - ] - }, - { - "id": 19, - "type": "timeseries", - "title": "Cluster egress throughput", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 34 - }, - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", - "refId": "A", - "legendFormat": "Egress" - } - ], - "fieldConfig": { - "defaults": { - "unit": "Bps" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-network dashboard", - "url": "/d/atlas-network", - "targetBlank": true - } - ] - }, - { - "id": 20, - "type": "timeseries", - "title": "Root filesystem usage", - 
"datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 41 - }, - "targets": [ - { - "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] - }, - "tooltip": { - "mode": "multi" - } - }, - "timeFrom": "30d", - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 21, - "type": "bargauge", - "title": "Nodes closest to full root disks", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 41 - }, - "targets": [ - { - "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 70 - }, - { - "color": "red", - "value": 85 - } - ] - } - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - 
"lastNotNull" - ], - "fields": "", - "values": false - } - }, - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 21, + "id": 23, "type": "stat", "title": "Astreae usage", "datasource": { @@ -1173,7 +621,7 @@ "h": 6, "w": 6, "x": 0, - "y": 49 + "y": 10 }, "targets": [ { @@ -1233,7 +681,7 @@ ] }, { - "id": 22, + "id": 24, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1244,7 +692,7 @@ "h": 6, "w": 6, "x": 6, - "y": 49 + "y": 10 }, "targets": [ { @@ -1304,7 +752,7 @@ ] }, { - "id": 23, + "id": 25, "type": "stat", "title": "Astreae free", "datasource": { @@ -1315,7 +763,7 @@ "h": 6, "w": 6, "x": 12, - "y": 49 + "y": 10 }, "targets": [ { @@ -1371,7 +819,7 @@ ] }, { - "id": 24, + "id": 26, "type": "stat", "title": "Asteria free", "datasource": { @@ -1382,7 +830,7 @@ "h": 6, "w": 6, "x": 18, - "y": 49 + "y": 10 }, "targets": [ { @@ -1436,6 +884,588 @@ "targetBlank": true } ] + }, + { + "id": 11, + "type": "piechart", + "title": "Namespace CPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 16 + }, + "targets": [ + { + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 12, + "type": "piechart", + "title": "Namespace GPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 16 + }, + "targets": [ + { + "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { 
+ "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 13, + "type": "piechart", + "title": "Namespace RAM share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 16 + }, + "targets": [ + { + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + 
"values": false + } + } + }, + { + "id": 14, + "type": "timeseries", + "title": "Worker node CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 25 + }, + "targets": [ + { + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] + }, + { + "id": 15, + "type": "timeseries", + "title": "Worker node RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + }, + "targets": [ + { + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { 
+ "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] + }, + { + "id": 16, + "type": "timeseries", + "title": "Control plane CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 33 + }, + "targets": [ + { + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 17, + "type": "timeseries", + "title": "Control plane RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 33 + }, + "targets": [ + { + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + 
"displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 18, + "type": "timeseries", + "title": "Cluster ingress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 40 + }, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "refId": "A", + "legendFormat": "Ingress (Traefik)" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 19, + "type": "timeseries", + "title": "Cluster egress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 40 + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "refId": "A", + "legendFormat": "Egress (Traefik)" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 20, + "type": "timeseries", + "title": "Intra-cluster throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 40 + }, + "targets": [ + { + "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - 
(sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "refId": "A", + "legendFormat": "Internal traffic" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 21, + "type": "timeseries", + "title": "Root filesystem usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 47 + }, + "targets": [ + { + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "30d", + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] + }, + { + "id": 22, + "type": "bargauge", + "title": "Nodes closest to full root disks", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 47 + }, + "targets": [ + { + "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) 
label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ], + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] } ], "schemaVersion": 39, diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml new file mode 100644 index 0000000..efd32c5 --- /dev/null +++ b/services/monitoring/dcgm-exporter.yaml @@ -0,0 +1,74 @@ +# services/monitoring/dcgm-exporter.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: dcgm-exporter + namespace: monitoring + labels: + app: dcgm-exporter +spec: + selector: + matchLabels: + app: dcgm-exporter + template: + metadata: + labels: + app: dcgm-exporter + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9400" + spec: + serviceAccountName: default + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - titan-20 + - titan-21 + - titan-22 + - titan-24 + tolerations: + - operator: Exists + containers: + - name: dcgm-exporter + image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-1 + imagePullPolicy: IfNotPresent + ports: + - name: metrics + containerPort: 9400 + env: + - name: DCGM_EXPORTER_KUBERNETES + value: "true" + 
securityContext: + privileged: true + resources: + requests: + cpu: 50m + memory: 64Mi + volumeMounts: + - name: pod-resources + mountPath: /var/lib/kubelet/pod-resources + volumes: + - name: pod-resources + hostPath: + path: /var/lib/kubelet/pod-resources + type: Directory +--- +apiVersion: v1 +kind: Service +metadata: + name: dcgm-exporter + namespace: monitoring + labels: + app: dcgm-exporter +spec: + selector: + app: dcgm-exporter + ports: + - name: metrics + port: 9400 + targetPort: metrics diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index a552793..ade7457 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -89,7 +89,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -136,7 +136,7 @@ data: { "id": 3, "type": "stat", - "title": "Top router req/s", + "title": "Intra-cluster traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -147,6 +147,66 @@ data: "x": 16, "y": 0 }, + "targets": [ + { + "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + 
}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "Bps", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Top router req/s", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 4 + }, "targets": [ { "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", @@ -196,7 +256,7 @@ data: } }, { - "id": 4, + "id": 5, "type": "timeseries", "title": "Per-node throughput", "datasource": { @@ -207,7 +267,7 @@ data: "h": 8, "w": 24, "x": 0, - "y": 4 + "y": 8 }, "targets": [ { @@ -233,7 +293,7 @@ data: } }, { - "id": 5, + "id": 6, "type": "table", "title": "Top namespaces", "datasource": { @@ -244,7 +304,7 @@ data: "h": 9, "w": 12, "x": 0, - "y": 12 + "y": 16 }, "targets": [ { @@ -269,7 +329,7 @@ data: ] }, { - "id": 6, + "id": 7, "type": "table", "title": "Top pods", "datasource": { @@ -280,7 +340,7 @@ data: "h": 9, "w": 12, "x": 12, - "y": 12 + "y": 16 }, "targets": [ { @@ -305,7 +365,7 @@ data: ] }, { - "id": 7, + "id": 8, "type": "timeseries", "title": "Traefik routers (req/s)", "datasource": { @@ -316,7 +376,7 @@ data: "h": 9, "w": 12, "x": 0, - "y": 21 + "y": 25 }, "targets": [ { @@ -342,7 +402,7 @@ data: } }, { - "id": 8, + "id": 9, "type": "timeseries", "title": "Traefik entrypoints (req/s)", "datasource": { @@ -353,7 +413,7 @@ data: "h": 9, "w": 12, "x": 12, - "y": 21 + "y": 25 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 6503da9..d20a5a4 100644 --- 
a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -334,59 +334,6 @@ data: } ] }, - { - "id": 6, - "type": "gauge", - "title": "Running pods", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 4, - "x": 20, - "y": 0 - }, - "targets": [ - { - "expr": "sum(kube_pod_status_phase{phase=\"Running\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "min": 0, - "max": 5, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 5 - } - ] - } - }, - "overrides": [] - }, - "options": { - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false - } - }, { "id": 7, "type": "stat", @@ -672,506 +619,7 @@ data: ] }, { - "id": 11, - "type": "piechart", - "title": "Namespace CPU share", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 0, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - 
}, - "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 12, - "type": "piechart", - "title": "Namespace GPU share", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 8, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( (sum(rate(container_accelerator_duty_cycle{namespace!=\"\",accelerator=\"nvidia.com/gpu\"}[5m])) by (namespace)) or on(namespace) (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 0) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 13, - "type": "piechart", - "title": "Namespace RAM share", - "datasource": 
{ - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 16, - "y": 10 - }, - "targets": [ - { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace) * 10)) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", - "refId": "A", - "legendFormat": "{{namespace}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 14, - "type": "timeseries", - "title": "Worker node CPU", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 19 - }, - "targets": [ - { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": 
"percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-nodes dashboard", - "url": "/d/atlas-nodes", - "targetBlank": true - } - ] - }, - { - "id": 15, - "type": "timeseries", - "title": "Worker node RAM", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 19 - }, - "targets": [ - { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-nodes dashboard", - "url": "/d/atlas-nodes", - "targetBlank": true - } - ] - }, - { - "id": 16, - "type": "timeseries", - "title": "Control plane CPU", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 27 - }, - "targets": [ - { - "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() 
label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 17, - "type": "timeseries", - "title": "Control plane RAM", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 27 - }, - "targets": [ - { - "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 18, - "type": "timeseries", - "title": "Cluster ingress throughput", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 34 - }, - "targets": [ - { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", - "refId": "A", - "legendFormat": "Ingress" - } - ], - "fieldConfig": { - "defaults": { - "unit": "Bps" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-network dashboard", - "url": "/d/atlas-network", - "targetBlank": 
true - } - ] - }, - { - "id": 19, - "type": "timeseries", - "title": "Cluster egress throughput", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 34 - }, - "targets": [ - { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", - "refId": "A", - "legendFormat": "Egress" - } - ], - "fieldConfig": { - "defaults": { - "unit": "Bps" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "links": [ - { - "title": "Open atlas-network dashboard", - "url": "/d/atlas-network", - "targetBlank": true - } - ] - }, - { - "id": 20, - "type": "timeseries", - "title": "Root filesystem usage", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 41 - }, - "targets": [ - { - "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right", - "calcs": [ - "last" - ] - }, - "tooltip": { - "mode": "multi" - } - }, - "timeFrom": "30d", - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 21, - "type": "bargauge", - "title": "Nodes closest to full root disks", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 41 - }, - "targets": [ - { - "expr": "topk(8, avg 
by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", - "refId": "A", - "legendFormat": "{{node}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 70 - }, - { - "color": "red", - "value": 85 - } - ] - } - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 21, + "id": 23, "type": "stat", "title": "Astreae usage", "datasource": { @@ -1182,7 +630,7 @@ data: "h": 6, "w": 6, "x": 0, - "y": 49 + "y": 10 }, "targets": [ { @@ -1242,7 +690,7 @@ data: ] }, { - "id": 22, + "id": 24, "type": "stat", "title": "Asteria usage", "datasource": { @@ -1253,7 +701,7 @@ data: "h": 6, "w": 6, "x": 6, - "y": 49 + "y": 10 }, "targets": [ { @@ -1313,7 +761,7 @@ data: ] }, { - "id": 23, + "id": 25, "type": "stat", "title": "Astreae free", "datasource": { @@ -1324,7 +772,7 @@ data: "h": 6, "w": 6, "x": 12, - "y": 49 + "y": 10 }, "targets": [ { @@ -1380,7 +828,7 @@ data: ] }, { - "id": 24, + "id": 26, "type": "stat", "title": "Asteria free", "datasource": { @@ -1391,7 +839,7 @@ data: "h": 6, "w": 6, "x": 18, - "y": 49 + "y": 10 }, "targets": [ { @@ -1445,6 +893,588 @@ data: "targetBlank": true } ] + }, + { + "id": 11, + "type": "piechart", + "title": "Namespace CPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, 
+ "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 16 + }, + "targets": [ + { + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 12, + "type": "piechart", + "title": "Namespace GPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 16 + }, + "targets": [ + { + "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 13, + "type": "piechart", + "title": "Namespace RAM share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 16 + }, + "targets": [ + { + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} 
or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 14, + "type": "timeseries", + "title": "Worker node CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 25 + }, + "targets": [ + { + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": 
"/d/atlas-nodes", + "targetBlank": true + } + ] + }, + { + "id": 15, + "type": "timeseries", + "title": "Worker node RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + }, + "targets": [ + { + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-nodes dashboard", + "url": "/d/atlas-nodes", + "targetBlank": true + } + ] + }, + { + "id": 16, + "type": "timeseries", + "title": "Control plane CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 33 + }, + "targets": [ + { + "expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": 
"table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 17, + "type": "timeseries", + "title": "Control plane RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 33 + }, + "targets": [ + { + "expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 18, + "type": "timeseries", + "title": "Cluster ingress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 40 + }, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "refId": "A", + "legendFormat": "Ingress (Traefik)" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 19, + "type": "timeseries", + "title": "Cluster egress throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 40 + }, + "targets": [ + { + "expr": 
"sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "refId": "A", + "legendFormat": "Egress (Traefik)" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 20, + "type": "timeseries", + "title": "Intra-cluster throughput", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 40 + }, + "targets": [ + { + "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "refId": "A", + "legendFormat": "Internal traffic" + } + ], + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "links": [ + { + "title": "Open atlas-network dashboard", + "url": "/d/atlas-network", + "targetBlank": true + } + ] + }, + { + "id": 21, + "type": "timeseries", + "title": "Root filesystem usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 47 + }, + "targets": [ + { + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + 
"fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "30d", + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] + }, + { + "id": 22, + "type": "bargauge", + "title": "Nodes closest to full root disks", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 47 + }, + "targets": [ + { + "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ], + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] } ], "schemaVersion": 39, diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 76263c1..3164862 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -10,5 +10,6 @@ resources: - 
grafana-dashboard-nodes.yaml - grafana-dashboard-storage.yaml - grafana-dashboard-network.yaml + - dcgm-exporter.yaml - grafana-folders.yaml - helmrelease.yaml From 46410c9a9dff467c788112421df680b5b2c7b441 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 14:19:23 -0300 Subject: [PATCH 47/71] monitoring: fix dcgm image --- services/monitoring/dcgm-exporter.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml index efd32c5..eaa3930 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -35,7 +35,7 @@ spec: - operator: Exists containers: - name: dcgm-exporter - image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-1 + image: docker.io/nvidia/dcgm-exporter:3.3.5-1-ubuntu22.04 imagePullPolicy: IfNotPresent ports: - name: metrics From 5a2575d54eff036ae3523180acdcc1acaab8269b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 14:33:24 -0300 Subject: [PATCH 48/71] flux: scope monitoring health checks --- .../platform/monitoring/kustomization.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml b/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml index 2899531..f684773 100644 --- a/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml +++ b/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml @@ -12,3 +12,20 @@ spec: kind: GitRepository name: flux-system wait: true + healthChecks: + - apiVersion: helm.toolkit.fluxcd.io/v2 + kind: HelmRelease + name: grafana + namespace: monitoring + - apiVersion: helm.toolkit.fluxcd.io/v2 + kind: HelmRelease + name: victoria-metrics-single + namespace: monitoring + - apiVersion: helm.toolkit.fluxcd.io/v2 + kind: HelmRelease + name: node-exporter + namespace: monitoring + - apiVersion: helm.toolkit.fluxcd.io/v2 + kind: HelmRelease + name: alertmanager + 
namespace: monitoring From 909cb4ff26565da718099180ab784bd793d1a508 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 15:04:18 -0300 Subject: [PATCH 49/71] flux: disable wait for monitoring --- .../platform/monitoring/kustomization.yaml | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml b/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml index f684773..82ad672 100644 --- a/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml +++ b/clusters/atlas/flux-system/platform/monitoring/kustomization.yaml @@ -11,21 +11,4 @@ spec: sourceRef: kind: GitRepository name: flux-system - wait: true - healthChecks: - - apiVersion: helm.toolkit.fluxcd.io/v2 - kind: HelmRelease - name: grafana - namespace: monitoring - - apiVersion: helm.toolkit.fluxcd.io/v2 - kind: HelmRelease - name: victoria-metrics-single - namespace: monitoring - - apiVersion: helm.toolkit.fluxcd.io/v2 - kind: HelmRelease - name: node-exporter - namespace: monitoring - - apiVersion: helm.toolkit.fluxcd.io/v2 - kind: HelmRelease - name: alertmanager - namespace: monitoring + wait: false From 7b2a69cfe3c2fdcb08aa123dc4dcb3a6c90f0925 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 15:10:58 -0300 Subject: [PATCH 50/71] monitoring: disable dcgm exporter --- services/monitoring/kustomization.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 3164862..76263c1 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -10,6 +10,5 @@ resources: - grafana-dashboard-nodes.yaml - grafana-dashboard-storage.yaml - grafana-dashboard-network.yaml - - dcgm-exporter.yaml - grafana-folders.yaml - helmrelease.yaml From c7b7bc7a6db29567595878dd6d6786dd2a0a18bb Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 15:55:24 -0300 Subject: 
[PATCH 51/71] monitoring: adjust overview spacing and net panels --- scripts/render_dashboards.py | 32 +++++++++++++------ .../monitoring/dashboards/atlas-network.json | 2 +- .../monitoring/dashboards/atlas-overview.json | 32 +++++++++---------- .../monitoring/grafana-dashboard-network.yaml | 2 +- .../grafana-dashboard-overview.yaml | 32 +++++++++---------- 5 files changed, 56 insertions(+), 44 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 273090a..bf06d40 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -81,6 +81,7 @@ CONTROL_SUFFIX = f"/{CONTROL_TOTAL}" WORKER_SUFFIX = f"/{WORKER_TOTAL}" CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring" LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]" +GAUGE_WIDTHS = [5, 5, 5, 5, 4] # --------------------------------------------------------------------------- # PromQL helpers @@ -262,13 +263,18 @@ TRAEFIK_NET_EGRESS = ( 'sum(rate(container_network_transmit_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))' " or on() vector(0)" ) -NET_TOTAL_EXPR = ( +NET_CLUSTER_RX = ( + 'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))' + " or on() vector(0)" +) +NET_CLUSTER_TX = ( 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))' " or on() vector(0)" ) +NET_TOTAL_EXPR = NET_CLUSTER_TX NET_INGRESS_EXPR = TRAEFIK_NET_INGRESS NET_EGRESS_EXPR = TRAEFIK_NET_EGRESS -NET_INTERNAL_EXPR = f"clamp_min(({NET_TOTAL_EXPR}) - ({TRAEFIK_NET_EGRESS}), 0)" +NET_INTERNAL_EXPR = f"clamp_min((({NET_CLUSTER_RX}) + ({NET_CLUSTER_TX})) - (({TRAEFIK_NET_INGRESS}) + ({TRAEFIK_NET_EGRESS})), 0)" # --------------------------------------------------------------------------- # Panel factories @@ -534,6 +540,11 @@ def build_overview(): link_to("atlas-pods"), ), ] + def gauge_grid(idx): + width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4 + x = sum(GAUGE_WIDTHS[:idx]) 
+ return width, x + for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats): thresholds = None min_value = 0 @@ -577,12 +588,13 @@ def build_overview(): {"color": "red", "value": max_value}, ], } + width, x = gauge_grid(idx) panels.append( gauge_panel( panel_id, title, expr, - {"h": 5, "w": 4, "x": 4 * idx, "y": 0}, + {"h": 5, "w": width, "x": x, "y": 0}, min_value=min_value, max_value=max_value, thresholds=thresholds, @@ -662,7 +674,7 @@ def build_overview(): 14, "Worker node CPU", node_cpu_expr(worker_filter), - {"h": 8, "w": 12, "x": 0, "y": 25}, + {"h": 8, "w": 12, "x": 0, "y": 32}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -676,7 +688,7 @@ def build_overview(): 15, "Worker node RAM", node_mem_expr(worker_filter), - {"h": 8, "w": 12, "x": 12, "y": 25}, + {"h": 8, "w": 12, "x": 12, "y": 32}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -691,7 +703,7 @@ def build_overview(): 16, "Control plane CPU", node_cpu_expr(CONTROL_REGEX), - {"h": 7, "w": 12, "x": 0, "y": 33}, + {"h": 7, "w": 12, "x": 0, "y": 40}, unit="percent", legend="{{node}}", legend_display="table", @@ -703,7 +715,7 @@ def build_overview(): 17, "Control plane RAM", node_mem_expr(CONTROL_REGEX), - {"h": 7, "w": 12, "x": 12, "y": 33}, + {"h": 7, "w": 12, "x": 12, "y": 40}, unit="percent", legend="{{node}}", legend_display="table", @@ -716,7 +728,7 @@ def build_overview(): 18, "Cluster ingress throughput", NET_INGRESS_EXPR, - {"h": 7, "w": 8, "x": 0, "y": 40}, + {"h": 7, "w": 8, "x": 0, "y": 25}, unit="Bps", legend="Ingress (Traefik)", legend_display="list", @@ -729,7 +741,7 @@ def build_overview(): 19, "Cluster egress throughput", NET_EGRESS_EXPR, - {"h": 7, "w": 8, "x": 8, "y": 40}, + {"h": 7, "w": 8, "x": 8, "y": 25}, unit="Bps", legend="Egress (Traefik)", legend_display="list", @@ -742,7 +754,7 @@ def build_overview(): 20, "Intra-cluster throughput", NET_INTERNAL_EXPR, - {"h": 7, "w": 8, "x": 16, "y": 40}, + {"h": 7, "w": 8, "x": 16, "y": 
25}, unit="Bps", legend="Internal traffic", legend_display="list", diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index 1baec3a..8a8b8f4 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -140,7 +140,7 @@ }, "targets": [ { - "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index eba6466..4cd4b29 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -17,7 +17,7 @@ }, "gridPos": { "h": 5, - "w": 4, + "w": 5, "x": 0, "y": 0 }, @@ -78,8 +78,8 @@ }, "gridPos": { "h": 5, - "w": 4, - "x": 4, + "w": 5, + "x": 5, "y": 0 }, "targets": [ @@ -131,8 +131,8 @@ }, "gridPos": { "h": 5, - "w": 4, - "x": 8, + "w": 5, + "x": 10, "y": 0 }, "targets": [ @@ -199,8 +199,8 @@ }, "gridPos": { "h": 5, - "w": 4, - "x": 12, + "w": 5, + "x": 15, "y": 0 }, "targets": [ @@ -268,7 +268,7 @@ "gridPos": { "h": 5, "w": 4, - "x": 16, + "x": 20, "y": 0 }, "targets": [ @@ -1056,7 +1056,7 @@ "h": 8, "w": 12, "x": 0, - "y": 25 + "y": 32 }, "targets": [ { @@ -1103,7 
+1103,7 @@ "h": 8, "w": 12, "x": 12, - "y": 25 + "y": 32 }, "targets": [ { @@ -1150,7 +1150,7 @@ "h": 7, "w": 12, "x": 0, - "y": 33 + "y": 40 }, "targets": [ { @@ -1187,7 +1187,7 @@ "h": 7, "w": 12, "x": 12, - "y": 33 + "y": 40 }, "targets": [ { @@ -1224,7 +1224,7 @@ "h": 7, "w": 8, "x": 0, - "y": 40 + "y": 25 }, "targets": [ { @@ -1268,7 +1268,7 @@ "h": 7, "w": 8, "x": 8, - "y": 40 + "y": 25 }, "targets": [ { @@ -1312,11 +1312,11 @@ "h": 7, "w": 8, "x": 16, - "y": 40 + "y": 25 }, "targets": [ { - "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", "refId": "A", "legendFormat": "Internal traffic" } diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index ade7457..1727e6a 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -149,7 +149,7 @@ data: }, "targets": [ { - "expr": "clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "expr": 
"clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index d20a5a4..99d6d46 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -26,7 +26,7 @@ data: }, "gridPos": { "h": 5, - "w": 4, + "w": 5, "x": 0, "y": 0 }, @@ -87,8 +87,8 @@ data: }, "gridPos": { "h": 5, - "w": 4, - "x": 4, + "w": 5, + "x": 5, "y": 0 }, "targets": [ @@ -140,8 +140,8 @@ data: }, "gridPos": { "h": 5, - "w": 4, - "x": 8, + "w": 5, + "x": 10, "y": 0 }, "targets": [ @@ -208,8 +208,8 @@ data: }, "gridPos": { "h": 5, - "w": 4, - "x": 12, + "w": 5, + "x": 15, "y": 0 }, "targets": [ @@ -277,7 +277,7 @@ data: "gridPos": { "h": 5, "w": 4, - "x": 16, + "x": 20, "y": 0 }, "targets": [ @@ -1065,7 +1065,7 @@ data: "h": 8, "w": 12, "x": 0, - "y": 25 + "y": 32 }, "targets": [ { @@ -1112,7 +1112,7 @@ data: "h": 8, "w": 12, "x": 12, - "y": 25 + "y": 32 }, "targets": [ { @@ -1159,7 +1159,7 @@ data: "h": 7, "w": 12, "x": 0, - "y": 33 + "y": 40 }, "targets": [ { @@ -1196,7 +1196,7 @@ data: "h": 7, "w": 12, "x": 12, - "y": 33 + "y": 40 }, "targets": [ { @@ -1233,7 +1233,7 @@ data: "h": 7, "w": 8, "x": 0, - "y": 40 + "y": 25 }, "targets": [ { @@ -1277,7 +1277,7 @@ data: "h": 7, "w": 8, "x": 8, - "y": 40 + "y": 25 }, "targets": [ { @@ -1321,11 +1321,11 @@ data: "h": 7, "w": 8, "x": 16, - "y": 40 + "y": 25 }, "targets": [ { - "expr": 
"clamp_min((sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) - (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)), 0)", + "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", "refId": "A", "legendFormat": "Internal traffic" } From f06be37f44e53c3f80870fca822b421f61e86901 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 16:18:52 -0300 Subject: [PATCH 52/71] monitoring: refine network metrics and control-plane allowance --- scripts/render_dashboards.py | 21 ++++++++++++++----- .../monitoring/dashboards/atlas-network.json | 6 +++--- .../monitoring/dashboards/atlas-nodes.json | 2 +- .../monitoring/dashboards/atlas-overview.json | 8 +++---- .../monitoring/dashboards/atlas-pods.json | 2 +- .../monitoring/grafana-dashboard-network.yaml | 6 +++--- .../monitoring/grafana-dashboard-nodes.yaml | 2 +- .../grafana-dashboard-overview.yaml | 8 +++---- .../monitoring/grafana-dashboard-pods.yaml | 2 +- 9 files changed, 34 insertions(+), 23 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index bf06d40..33b388d 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -79,7 +79,7 @@ CONTROL_TOTAL = len(CONTROL_PLANE_NODES) WORKER_TOTAL = len(WORKER_NODES) CONTROL_SUFFIX = f"/{CONTROL_TOTAL}" WORKER_SUFFIX = f"/{WORKER_TOTAL}" -CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring" +CP_ALLOWED_NS = 
"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system" LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]" GAUGE_WIDTHS = [5, 5, 5, 5, 4] @@ -271,10 +271,21 @@ NET_CLUSTER_TX = ( 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))' " or on() vector(0)" ) -NET_TOTAL_EXPR = NET_CLUSTER_TX -NET_INGRESS_EXPR = TRAEFIK_NET_INGRESS -NET_EGRESS_EXPR = TRAEFIK_NET_EGRESS -NET_INTERNAL_EXPR = f"clamp_min((({NET_CLUSTER_RX}) + ({NET_CLUSTER_TX})) - (({TRAEFIK_NET_INGRESS}) + ({TRAEFIK_NET_EGRESS})), 0)" +PHYSICAL_NET_FILTER = 'device!~"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*"' +NET_NODE_RX_PHYS = ( + f'sum(rate(node_network_receive_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)' +) +NET_NODE_TX_PHYS = ( + f'sum(rate(node_network_transmit_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)' +) +NET_TOTAL_EXPR = NET_NODE_TX_PHYS +NET_INGRESS_EXPR = NET_NODE_RX_PHYS +NET_EGRESS_EXPR = NET_NODE_TX_PHYS +NET_INTERNAL_EXPR = ( + 'sum(rate(container_network_receive_bytes_total{namespace!="traefik",pod!="",container!=""}[5m]) ' + '+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!="",container!=""}[5m]))' + ' or on() vector(0)' +) # --------------------------------------------------------------------------- # Panel factories diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index 8a8b8f4..ca671c8 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -80,7 +80,7 @@ }, "targets": [ { - "expr": 
"sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -140,7 +140,7 @@ }, "targets": [ { - "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index e974d8a..3cf784f 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -142,7 +142,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 4cd4b29..156d96f 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -137,7 +137,7 @@ }, "targets": [ { 
- "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", "refId": "A" } ], @@ -1228,7 +1228,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Ingress (Traefik)" } @@ -1272,7 +1272,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Egress (Traefik)" } @@ -1316,7 +1316,7 @@ }, "targets": [ { - "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Internal traffic" } diff --git a/services/monitoring/dashboards/atlas-pods.json 
b/services/monitoring/dashboards/atlas-pods.json index 8494e89..f519d14 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -200,7 +200,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index 1727e6a..fa5b742 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -89,7 +89,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A" } ], @@ -149,7 +149,7 @@ data: }, "targets": [ { - "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + 
(sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index afbeb3c..c78e994 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -151,7 +151,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 99d6d46..957bb6a 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -146,7 +146,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", "refId": "A" } ], @@ -1237,7 +1237,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A", 
"legendFormat": "Ingress (Traefik)" } @@ -1281,7 +1281,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)", + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Egress (Traefik)" } @@ -1325,7 +1325,7 @@ data: }, "targets": [ { - "expr": "clamp_min(((sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0))) - ((sum(rate(container_network_receive_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0)) + (sum(rate(container_network_transmit_bytes_total{namespace=\"traefik\",pod=~\"traefik-.*\"}[5m])) or on() vector(0))), 0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Internal traffic" } diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index e160eca..78beca5 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -209,7 +209,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", "refId": "A" } ], From e4f93e85d25a57b8cb9521e9dcd80b34ca869e0b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 17:09:13 -0300 
Subject: [PATCH 53/71] monitoring: control-plane stat and namespace share tweaks --- scripts/render_dashboards.py | 48 ++++++++++++------- .../monitoring/dashboards/atlas-network.json | 2 +- .../monitoring/dashboards/atlas-overview.json | 29 ++++++----- .../monitoring/grafana-dashboard-network.yaml | 2 +- .../grafana-dashboard-overview.yaml | 29 ++++++----- 5 files changed, 70 insertions(+), 40 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 33b388d..812a931 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -82,6 +82,9 @@ WORKER_SUFFIX = f"/{WORKER_TOTAL}" CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system" LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]" GAUGE_WIDTHS = [5, 5, 5, 5, 4] +CONTROL_WORKLOADS_EXPR = ( + f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)' +) # --------------------------------------------------------------------------- # PromQL helpers @@ -168,7 +171,7 @@ def node_io_expr(scope=""): def namespace_share_expr(resource_expr): selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )" - total = f"clamp_min(sum( {resource_expr} ), 1)" + total = f"clamp_min(sum( {selected} ), 1)" return f"100 * ( {selected} ) / {total}" @@ -282,8 +285,8 @@ NET_TOTAL_EXPR = NET_NODE_TX_PHYS NET_INGRESS_EXPR = NET_NODE_RX_PHYS NET_EGRESS_EXPR = NET_NODE_TX_PHYS NET_INTERNAL_EXPR = ( - 'sum(rate(container_network_receive_bytes_total{namespace!="traefik",pod!="",container!=""}[5m]) ' - '+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!="",container!=""}[5m]))' + 'sum(rate(container_network_receive_bytes_total{namespace!="traefik",pod!=""}[5m]) ' + '+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))' ' or on() vector(0)' ) @@ -529,9 +532,9 @@ def build_overview(): ( 3, "Control plane workloads", - 
f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})', + CONTROL_WORKLOADS_EXPR, None, - 1, + 4, link_to("atlas-pods"), ), ( @@ -600,18 +603,31 @@ def build_overview(): ], } width, x = gauge_grid(idx) - panels.append( - gauge_panel( - panel_id, - title, - expr, - {"h": 5, "w": width, "x": x, "y": 0}, - min_value=min_value, - max_value=max_value, - thresholds=thresholds, - links=links, + if panel_id == 3: + panels.append( + stat_panel( + panel_id, + title, + expr, + {"h": 5, "w": width, "x": x, "y": 0}, + thresholds=thresholds, + legend=None, + links=links, + ) + ) + else: + panels.append( + gauge_panel( + panel_id, + title, + expr, + {"h": 5, "w": width, "x": x, "y": 0}, + min_value=min_value, + max_value=max_value, + thresholds=thresholds, + links=links, + ) ) - ) hottest = [ (7, "Hottest node: CPU", topk_with_node(node_cpu_expr()), "percent"), diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index ca671c8..9005eb9 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -140,7 +140,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 156d96f..93a246b 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -123,7 +123,7 @@ }, { "id": 3, - "type": "gauge", + "type": "stat", "title": "Control plane 
workloads", "datasource": { "type": "prometheus", @@ -137,14 +137,16 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"}) or on() vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { - "min": 0, - "max": 4, + "color": { + "mode": "palette-classic" + }, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ @@ -165,11 +167,18 @@ "value": 3 } ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" } }, "overrides": [] }, "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -177,9 +186,7 @@ "fields": "", "values": false }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false + "textMode": "value" }, "links": [ { @@ -901,7 +908,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -954,7 +961,7 @@ }, "targets": [ { - "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + 
((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ), 1)", + "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or 
kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1007,7 +1014,7 @@ }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) 
by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1316,7 +1323,7 @@ }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Internal traffic" } diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index fa5b742..d2372de 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -149,7 +149,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)", "refId": "A" } ], diff --git 
a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 957bb6a..ebd9b2b 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -132,7 +132,7 @@ data: }, { "id": 3, - "type": "gauge", + "type": "stat", "title": "Control plane workloads", "datasource": { "type": "prometheus", @@ -146,14 +146,16 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"}) or on() vector(0)", "refId": "A" } ], "fieldConfig": { "defaults": { - "min": 0, - "max": 4, + "color": { + "mode": "palette-classic" + }, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ @@ -174,11 +176,18 @@ data: "value": 3 } ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" } }, "overrides": [] }, "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -186,9 +195,7 @@ data: "fields": "", "values": false }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false + "textMode": "value" }, "links": [ { @@ -910,7 +917,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or 
kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ), 1)", + "expr": "100 * ( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -963,7 +970,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( 
(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ), 1)", + "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) 
(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1016,7 +1023,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ), 1)", + "expr": "100 * ( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by 
(namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1325,7 +1332,7 @@ data: }, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\",container!=\"\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"traefik\",pod!=\"\"}[5m])) or on() vector(0)", "refId": "A", "legendFormat": "Internal traffic" } From 630f1f2a810c02f847474a2ff62a5c5079981a8f Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 19:43:19 -0300 Subject: [PATCH 54/71] traefik: extend upload timeouts --- infrastructure/traefik/deployment.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/infrastructure/traefik/deployment.yaml 
b/infrastructure/traefik/deployment.yaml index ba16909..196954c 100644 --- a/infrastructure/traefik/deployment.yaml +++ b/infrastructure/traefik/deployment.yaml @@ -39,6 +39,14 @@ items: - --metrics.prometheus.addEntryPointsLabels=true - --metrics.prometheus.addRoutersLabels=true - --metrics.prometheus.addServicesLabels=true + - --entrypoints.web.forwardingTimeouts.dialTimeout=120s + - --entrypoints.web.forwardingTimeouts.responseHeaderTimeout=10m + - --entrypoints.web.transport.respondingTimeouts.readTimeout=0 + - --entrypoints.web.transport.respondingTimeouts.idleTimeout=0 + - --entrypoints.websecure.forwardingTimeouts.dialTimeout=120s + - --entrypoints.websecure.forwardingTimeouts.responseHeaderTimeout=10m + - --entrypoints.websecure.transport.respondingTimeouts.readTimeout=0 + - --entrypoints.websecure.transport.respondingTimeouts.idleTimeout=0 - --entrypoints.metrics.address=:9100 - --metrics.prometheus.entryPoint=metrics image: traefik:v3.3.3 From 75f6a593160ec50258ff92dd0f8532b5a3137608 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 18 Nov 2025 20:01:16 -0300 Subject: [PATCH 55/71] traefik: use responding timeouts only --- infrastructure/traefik/deployment.yaml | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/infrastructure/traefik/deployment.yaml b/infrastructure/traefik/deployment.yaml index 196954c..a34307a 100644 --- a/infrastructure/traefik/deployment.yaml +++ b/infrastructure/traefik/deployment.yaml @@ -39,14 +39,12 @@ items: - --metrics.prometheus.addEntryPointsLabels=true - --metrics.prometheus.addRoutersLabels=true - --metrics.prometheus.addServicesLabels=true - - --entrypoints.web.forwardingTimeouts.dialTimeout=120s - - --entrypoints.web.forwardingTimeouts.responseHeaderTimeout=10m - - --entrypoints.web.transport.respondingTimeouts.readTimeout=0 - - --entrypoints.web.transport.respondingTimeouts.idleTimeout=0 - - --entrypoints.websecure.forwardingTimeouts.dialTimeout=120s - - 
--entrypoints.websecure.forwardingTimeouts.responseHeaderTimeout=10m - - --entrypoints.websecure.transport.respondingTimeouts.readTimeout=0 - - --entrypoints.websecure.transport.respondingTimeouts.idleTimeout=0 + - --entrypoints.web.transport.respondingTimeouts.readTimeout=0s + - --entrypoints.web.transport.respondingTimeouts.writeTimeout=0s + - --entrypoints.web.transport.respondingTimeouts.idleTimeout=0s + - --entrypoints.websecure.transport.respondingTimeouts.readTimeout=0s + - --entrypoints.websecure.transport.respondingTimeouts.writeTimeout=0s + - --entrypoints.websecure.transport.respondingTimeouts.idleTimeout=0s - --entrypoints.metrics.address=:9100 - --metrics.prometheus.entryPoint=metrics image: traefik:v3.3.3 From d99bb06eeb9474d3522c2f72e0fb7bc20d8be86b Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 20 Nov 2025 13:11:13 -0300 Subject: [PATCH 56/71] monitoring: reenable dcgm exporter --- services/monitoring/dcgm-exporter.yaml | 2 +- services/monitoring/kustomization.yaml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml index eaa3930..9a4a1d4 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -35,7 +35,7 @@ spec: - operator: Exists containers: - name: dcgm-exporter - image: docker.io/nvidia/dcgm-exporter:3.3.5-1-ubuntu22.04 + image: registry.bstein.dev/monitoring/dcgm:4.4.2-1-ubuntu22.04 imagePullPolicy: IfNotPresent ports: - name: metrics diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 76263c1..3164862 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -10,5 +10,6 @@ resources: - grafana-dashboard-nodes.yaml - grafana-dashboard-storage.yaml - grafana-dashboard-network.yaml + - dcgm-exporter.yaml - grafana-folders.yaml - helmrelease.yaml From 5b89b0533e96cf37f87591afd356f3da627b341a Mon Sep 17 00:00:00 2001 
From: Brad Stein Date: Tue, 2 Dec 2025 11:54:53 -0300 Subject: [PATCH 57/71] monitoring: use mirrored dcgm-exporter tag --- services/monitoring/README.md | 12 ++++++++++++ services/monitoring/dcgm-exporter.yaml | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/services/monitoring/README.md b/services/monitoring/README.md index 74baf08..0e8885a 100644 --- a/services/monitoring/README.md +++ b/services/monitoring/README.md @@ -13,3 +13,15 @@ kubectl create secret generic grafana-admin \ ``` Update the password whenever you rotate credentials. + +## DCGM exporter image + +The NVIDIA GPU metrics DaemonSet expects `registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`, mirrored from `docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04`. Refresh it in Zot when bumping versions: + +```bash +skopeo copy \ + docker://docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 \ + docker://registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 +``` + +When finished mirroring from the control-plane, you can remove temporary tooling with `sudo apt-get purge -y skopeo && sudo apt-get autoremove -y` and clear `~/.config/containers/auth.json`. 
diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml index 9a4a1d4..766cf7b 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -35,7 +35,7 @@ spec: - operator: Exists containers: - name: dcgm-exporter - image: registry.bstein.dev/monitoring/dcgm:4.4.2-1-ubuntu22.04 + image: registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 imagePullPolicy: IfNotPresent ports: - name: metrics From d87a1dbc473230e57d6fcf229121ad7c8b20b302 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 2 Dec 2025 11:59:55 -0300 Subject: [PATCH 58/71] monitoring: allow dcgm rollout with unavailable node --- services/monitoring/dcgm-exporter.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml index 766cf7b..53f4e28 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -10,6 +10,9 @@ spec: selector: matchLabels: app: dcgm-exporter + updateStrategy: + rollingUpdate: + maxUnavailable: 2 template: metadata: labels: From ee923df567e1e79986f1ccde2b03922b0927a542 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 2 Dec 2025 12:07:11 -0300 Subject: [PATCH 59/71] monitoring: add registry pull secret for dcgm-exporter --- services/monitoring/dcgm-exporter.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml index 53f4e28..f23d742 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -55,6 +55,8 @@ spec: volumeMounts: - name: pod-resources mountPath: /var/lib/kubelet/pod-resources + imagePullSecrets: + - name: zot-regcred volumes: - name: pod-resources hostPath: From a18c3e6f67acb19d9104b46ea21f1fd7283029a4 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 2 Dec 2025 12:19:16 -0300 Subject: [PATCH 60/71] monitoring: always pull 
dcgm-exporter tag --- services/monitoring/dcgm-exporter.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml index f23d742..3dd6f34 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -39,7 +39,7 @@ spec: containers: - name: dcgm-exporter image: registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: metrics containerPort: 9400 From e26ef44d1a6072d395a231c741a5d229e611a576 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 2 Dec 2025 12:25:30 -0300 Subject: [PATCH 61/71] monitoring: run dcgm-exporter with nvidia runtime --- services/monitoring/dcgm-exporter.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml index 3dd6f34..06152e7 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -22,6 +22,7 @@ spec: prometheus.io/port: "9400" spec: serviceAccountName: default + runtimeClassName: nvidia affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: From f1d0970aa05300f1704e0b415188c2f0abb9ee8c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 2 Dec 2025 12:36:24 -0300 Subject: [PATCH 62/71] monitoring: mirror dcgm-exporter as multi-arch --- services/monitoring/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/services/monitoring/README.md b/services/monitoring/README.md index 0e8885a..835ae1d 100644 --- a/services/monitoring/README.md +++ b/services/monitoring/README.md @@ -20,6 +20,7 @@ The NVIDIA GPU metrics DaemonSet expects `registry.bstein.dev/monitoring/dcgm-ex ```bash skopeo copy \ + --all \ docker://docker.io/nvidia/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 \ docker://registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04 ``` From 
eed67b3db01c493bce6ba5aa6ef8e70d1ec4a041 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 2 Dec 2025 13:16:00 -0300 Subject: [PATCH 63/71] monitoring: regen dashboards with gpu details --- AGENTS.md | 42 ++++ ...shboards.py => dashboards_render_atlas.py} | 177 ++++++++++++---- services/monitoring/dashboards/atlas-gpu.json | 184 +++++++++++++++++ .../monitoring/dashboards/atlas-network.json | 5 +- .../monitoring/dashboards/atlas-overview.json | 14 +- .../monitoring/grafana-dashboard-gpu.yaml | 193 ++++++++++++++++++ .../monitoring/grafana-dashboard-network.yaml | 5 +- .../grafana-dashboard-overview.yaml | 14 +- services/monitoring/helmrelease.yaml | 9 + services/monitoring/kustomization.yaml | 1 + 10 files changed, 584 insertions(+), 60 deletions(-) create mode 100644 AGENTS.md rename scripts/{render_dashboards.py => dashboards_render_atlas.py} (90%) create mode 100644 services/monitoring/dashboards/atlas-gpu.json create mode 100644 services/monitoring/grafana-dashboard-gpu.yaml diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..05838aa --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,42 @@ + + +Repository Guidelines + +## Project Structure & Module Organization +- `infrastructure/`: cluster-scoped building blocks (core, flux-system, traefik, longhorn). Add new platform features by mirroring this layout. +- `services/`: workload manifests per app (`services/gitea/`, etc.) with `kustomization.yaml` plus one file per kind; keep diffs small and focused. +- `dockerfiles/` hosts bespoke images, while `scripts/` stores operational Fish/Bash helpers—extend these directories instead of relying on ad-hoc commands. + +## Build, Test, and Development Commands +- `kustomize build services/` (or `kubectl kustomize ...`) renders manifests exactly as Flux will. +- `kubectl apply --server-side --dry-run=client -k services/` checks schema compatibility without touching the cluster. 
+- `flux reconcile kustomization --namespace flux-system --with-source` pulls the latest Git state after merges or hotfixes. +- `fish scripts/flux_hammer.fish --help` explains the recovery tool; read it before running against production workloads. + +## Coding Style & Naming Conventions +- YAML uses two-space indents; retain the leading path comment (e.g. `# services/gitea/deployment.yaml`) to speed code review. +- Keep resource names lowercase kebab-case, align labels/selectors, and mirror namespaces with directory names. +- List resources in `kustomization.yaml` from namespace/config, through storage, then workloads and networking for predictable diffs. +- Scripts start with `#!/usr/bin/env fish` or bash, stay executable, and follow snake_case names such as `flux_hammer.fish`. + +## Testing Guidelines +- Run `kustomize build` and the dry-run apply for every service you touch; capture failures before opening a PR. +- `flux diff kustomization --path services/` previews reconciliations—link notable output when behavior shifts. +- Docker edits: `docker build -f dockerfiles/Dockerfile.monerod .` (swap the file you changed) to verify image builds. + +## Commit & Pull Request Guidelines +- Keep commit subjects short, present-tense, and optionally scoped (`gpu(titan-24): add RuntimeClass`); squash fixups before review. +- Describe linked issues, affected services, and required operator steps (e.g. `flux reconcile kustomization services-gitea`) in the PR body. +- Focus each PR on one kustomization or service and update `infrastructure/flux-system` when Flux must track new folders. +- Record the validation you ran (dry-runs, diffs, builds) and add screenshots only when ingress or UI behavior changes. + +## Security & Configuration Tips +- Never commit credentials; use Vault workflows (`services/vault/`) or SOPS-encrypted manifests wired through `infrastructure/flux-system`. 
+- Node selectors and tolerations gate workloads to hardware like `hardware: rpi4`; confirm labels before scaling or renaming nodes. +- Pin external images by digest or rely on Flux image automation to follow approved tags and avoid drift. + +## Dashboard roadmap / context (2025-12-02) +- Atlas dashboards are generated via `scripts/dashboards_render_atlas.py --build`, which writes JSON under `services/monitoring/dashboards/` and ConfigMaps under `services/monitoring/`. Keep the Grafana manifests in sync by regenerating after edits. +- Atlas Overview panels are paired with internal dashboards (pods, nodes, storage, network, GPU). A new `atlas-gpu` internal dashboard holds the detailed GPU metrics that feed the overview share pie. +- Old Grafana folders (`Atlas Storage`, `Atlas SRE`, `Atlas Public`, `Atlas Nodes`) should be removed in Grafana UI when convenient; only `Atlas Overview` and `Atlas Internal` should remain provisioned. +- Future work: add a separate generator (e.g., `dashboards_render_oceanus.py`) for SUI/oceanus validation dashboards, mirroring the atlas pattern of internal dashboards feeding a public overview. diff --git a/scripts/render_dashboards.py b/scripts/dashboards_render_atlas.py similarity index 90% rename from scripts/render_dashboards.py rename to scripts/dashboards_render_atlas.py index 812a931..97070d2 100644 --- a/scripts/render_dashboards.py +++ b/scripts/dashboards_render_atlas.py @@ -2,8 +2,8 @@ """Generate Atlas Grafana dashboards and render them into ConfigMaps. 
Usage: - scripts/render_dashboards.py --build # rebuild JSON + ConfigMaps - scripts/render_dashboards.py # re-render ConfigMaps from JSON + scripts/dashboards_render_atlas.py --build # rebuild JSON + ConfigMaps + scripts/dashboards_render_atlas.py # re-render ConfigMaps from JSON """ import argparse @@ -198,7 +198,6 @@ STUCK_TERMINATING_EXPR = ( ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)' '))' ) - PROBLEM_TABLE_EXPR = ( "(time() - kube_pod_created{pod!=\"\"}) " "* on(namespace,pod) group_left(node) kube_pod_info " @@ -489,6 +488,47 @@ def pie_panel(panel_id, title, expr, grid): } +def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None): + """Return a bar gauge panel with label-aware reduction.""" + panel = { + "id": panel_id, + "type": "bargauge", + "title": title, + "datasource": PROM_DS, + "gridPos": grid, + "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{node}}"}], + "fieldConfig": { + "defaults": { + "unit": unit, + "min": 0, + "max": 100 if unit == "percent" else None, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 50}, + {"color": "orange", "value": 70}, + {"color": "red", "value": 85}, + ], + }, + }, + "overrides": [], + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "/.*/", + "values": False, + }, + }, + } + if links: + panel["links"] = links + return panel + + def text_panel(panel_id, title, content, grid): return { "id": panel_id, @@ -554,6 +594,7 @@ def build_overview(): link_to("atlas-pods"), ), ] + def gauge_grid(idx): width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4 x = sum(GAUGE_WIDTHS[:idx]) @@ -806,38 +847,14 @@ def build_overview(): ) ) panels.append( - { - "id": 22, - "type": "bargauge", - "title": "Nodes closest to full root disks", - "datasource": PROM_DS, - "gridPos": {"h": 8, "w": 12, "x": 12, "y": 47}, - 
"targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "thresholds": { - "mode": "absolute", - "steps": [ - {"color": "green", "value": None}, - {"color": "yellow", "value": 50}, - {"color": "orange", "value": 70}, - {"color": "red", "value": 85}, - ], - }, - }, - "overrides": [], - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, - }, - "links": link_to("atlas-storage"), - "transformations": [{"id": "labelsToFields", "options": {}}], - } + bargauge_panel( + 22, + "Nodes closest to full root disks", + f"topk(8, {root_usage_expr()})", + {"h": 8, "w": 12, "x": 12, "y": 47}, + unit="percent", + links=link_to("atlas-storage"), + ) ) return { @@ -857,6 +874,7 @@ def build_overview(): {"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False}, {"title": "Atlas Storage", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False}, {"title": "Atlas Network", "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": False}, + {"title": "Atlas GPU", "type": "dashboard", "dashboardUid": "atlas-gpu", "keepTime": False}, ], } @@ -1179,13 +1197,31 @@ def build_storage_dashboard(): def build_network_dashboard(): panels = [] panels.append( - stat_panel(1, "Ingress traffic", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="Bps") + stat_panel( + 1, + "Ingress traffic", + NET_INGRESS_EXPR, + {"h": 4, "w": 8, "x": 0, "y": 0}, + unit="Bps", + ) ) panels.append( - stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps") + stat_panel( + 2, + "Egress traffic", + NET_EGRESS_EXPR, + {"h": 4, "w": 8, "x": 8, "y": 0}, + unit="Bps", + ) ) panels.append( - stat_panel(3, "Intra-cluster traffic", NET_INTERNAL_EXPR, {"h": 4, "w": 8, "x": 16, "y": 0}, unit="Bps") + 
stat_panel( + 3, + "Intra-cluster traffic", + NET_INTERNAL_EXPR, + {"h": 4, "w": 8, "x": 16, "y": 0}, + unit="Bps", + ) ) panels.append( stat_panel( @@ -1195,14 +1231,13 @@ def build_network_dashboard(): {"h": 4, "w": 8, "x": 0, "y": 4}, unit="req/s", legend="{{router}}", - instant=True, ) ) panels.append( timeseries_panel( 5, "Per-node throughput", - node_net_expr(), + f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})', {"h": 8, "w": 24, "x": 0, "y": 8}, unit="Bps", legend="{{node}}", @@ -1270,6 +1305,64 @@ def build_network_dashboard(): } +def build_gpu_dashboard(): + panels = [] + panels.append( + pie_panel( + 1, + "Namespace GPU share", + namespace_gpu_share_expr(), + {"h": 8, "w": 12, "x": 0, "y": 0}, + ) + ) + panels.append( + timeseries_panel( + 2, + "GPU util by namespace", + NAMESPACE_GPU_USAGE, + {"h": 8, "w": 12, "x": 12, "y": 0}, + unit="percent", + legend="{{namespace}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 3, + "GPU util by node", + 'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})', + {"h": 8, "w": 12, "x": 0, "y": 8}, + unit="percent", + legend="{{Hostname}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + table_panel( + 4, + "Top pods by GPU util", + 'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))', + {"h": 8, "w": 12, "x": 12, "y": 8}, + unit="percent", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + return { + "uid": "atlas-gpu", + "title": "Atlas GPU", + "folderUid": PRIVATE_FOLDER, + "editable": True, + "panels": panels, + "time": {"from": "now-12h", "to": "now"}, + "annotations": {"list": []}, + "schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "gpu"], + } + + DASHBOARDS = { "atlas-overview": { "builder": build_overview, @@ -1291,6 +1384,10 @@ DASHBOARDS = { "builder": build_network_dashboard, "configmap": ROOT / "services" / 
"monitoring" / "grafana-dashboard-network.yaml", }, + "atlas-gpu": { + "builder": build_gpu_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml", + }, } diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json new file mode 100644 index 0000000..da235a5 --- /dev/null +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -0,0 +1,184 @@ +{ + "uid": "atlas-gpu", + "title": "Atlas GPU", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "piechart", + "title": "Namespace GPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 2, + "type": "timeseries", + "title": "GPU util by namespace", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 3, + "type": "timeseries", + "title": "GPU util by node", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "targets": [ + { + "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})", + "refId": "A", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { 
+ "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 4, + "type": "table", + "title": "Top pods by GPU util", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "targets": [ + { + "expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "gpu" + ] +} diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index 9005eb9..f2291b7 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -202,8 +202,7 @@ { "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", "refId": "A", - "legendFormat": "{{router}}", - "instant": true + "legendFormat": "{{router}}" } ], "fieldConfig": { @@ -262,7 +261,7 @@ }, "targets": [ { - "expr": "avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) 
label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 93a246b..4e3c357 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1456,7 +1456,7 @@ "calcs": [ "lastNotNull" ], - "fields": "", + "fields": "/.*/", "values": false } }, @@ -1466,12 +1466,6 @@ "url": "/d/atlas-storage", "targetBlank": true } - ], - "transformations": [ - { - "id": "labelsToFields", - "options": {} - } ] } ], @@ -1512,6 +1506,12 @@ "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": false + }, + { + "title": "Atlas GPU", + "type": "dashboard", + "dashboardUid": "atlas-gpu", + "keepTime": false } ] } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml new file mode 100644 index 0000000..13262d6 --- /dev/null +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -0,0 +1,193 @@ +# services/monitoring/grafana-dashboard-gpu.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-gpu + labels: + grafana_dashboard: "1" +data: + atlas-gpu.json: | + { + "uid": "atlas-gpu", + "title": "Atlas GPU", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "piechart", + "title": "Namespace GPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 2, + "type": "timeseries", + "title": "GPU util by namespace", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 
0 + }, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 3, + "type": "timeseries", + "title": "GPU util by node", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "targets": [ + { + "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})", + "refId": "A", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 4, + "type": "table", + "title": "Top pods by GPU util", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "targets": [ + { + "expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "gpu" + ] + } diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index d2372de..4b78fb9 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -211,8 +211,7 @@ data: { "expr": "topk(1, sum by (router) 
(rate(traefik_router_requests_total[5m])))", "refId": "A", - "legendFormat": "{{router}}", - "instant": true + "legendFormat": "{{router}}" } ], "fieldConfig": { @@ -271,7 +270,7 @@ data: }, "targets": [ { - "expr": "avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index ebd9b2b..512adf9 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1465,7 +1465,7 @@ data: "calcs": [ "lastNotNull" ], - "fields": "", + "fields": "/.*/", "values": false } }, @@ -1475,12 +1475,6 @@ data: "url": "/d/atlas-storage", "targetBlank": true } - ], - "transformations": [ - { - "id": "labelsToFields", - "options": {} - } ] } ], @@ -1521,6 +1515,12 @@ data: "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": false + }, + { + "title": "Atlas GPU", + "type": "dashboard", + "dashboardUid": "atlas-gpu", + "keepTime": false } ] } diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 5a8f1ba..cf56b27 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -320,6 +320,14 @@ spec: editable: true options: path: 
/var/lib/grafana/dashboards/storage + - name: gpu + orgId: 1 + folder: Atlas Internal + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/gpu - name: network orgId: 1 folder: Atlas Internal @@ -333,6 +341,7 @@ spec: pods: grafana-dashboard-pods nodes: grafana-dashboard-nodes storage: grafana-dashboard-storage + gpu: grafana-dashboard-gpu network: grafana-dashboard-network extraConfigmapMounts: - name: grafana-folders diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 3164862..a50a1c1 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -10,6 +10,7 @@ resources: - grafana-dashboard-nodes.yaml - grafana-dashboard-storage.yaml - grafana-dashboard-network.yaml + - grafana-dashboard-gpu.yaml - dcgm-exporter.yaml - grafana-folders.yaml - helmrelease.yaml From a3dc9391eef5011902e410cab32246ee9d354ca9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 2 Dec 2025 14:41:39 -0300 Subject: [PATCH 64/71] monitoring: polish dashboards and folders --- scripts/dashboards_render_atlas.py | 119 +++++++++--------- services/monitoring/dashboards/atlas-gpu.json | 10 +- .../monitoring/dashboards/atlas-network.json | 18 +-- .../monitoring/dashboards/atlas-nodes.json | 12 +- .../monitoring/dashboards/atlas-overview.json | 85 +++++++------ .../monitoring/dashboards/atlas-pods.json | 8 +- .../monitoring/dashboards/atlas-storage.json | 16 +-- .../monitoring/grafana-dashboard-gpu.yaml | 10 +- .../monitoring/grafana-dashboard-network.yaml | 18 +-- .../monitoring/grafana-dashboard-nodes.yaml | 12 +- .../grafana-dashboard-overview.yaml | 85 +++++++------ .../monitoring/grafana-dashboard-pods.yaml | 8 +- .../monitoring/grafana-dashboard-storage.yaml | 16 +-- services/monitoring/grafana-folders.yaml | 11 +- services/monitoring/helmrelease.yaml | 4 +- 15 files changed, 238 insertions(+), 194 deletions(-) diff --git a/scripts/dashboards_render_atlas.py 
b/scripts/dashboards_render_atlas.py index 97070d2..11bd2c8 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -32,7 +32,7 @@ data: ) PROM_DS = {"type": "prometheus", "uid": "atlas-vm"} -PUBLIC_FOLDER = "atlas-overview" +PUBLIC_FOLDER = "overview" PRIVATE_FOLDER = "atlas-internal" PERCENT_THRESHOLDS = { @@ -231,10 +231,13 @@ NAMESPACE_GPU_ALLOC = ( 'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}' ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)' ) -NAMESPACE_GPU_USAGE = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)' +NAMESPACE_GPU_USAGE_SHARE = ( + 'avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)' +) +NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)' NAMESPACE_GPU_RAW = ( "(" - + NAMESPACE_GPU_USAGE + + NAMESPACE_GPU_USAGE_SHARE + ") or on(namespace) (" + NAMESPACE_CPU_RAW + " * 0)" @@ -519,7 +522,7 @@ def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None): "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"], - "fields": "/.*/", + "fields": "Value", "values": False, }, }, @@ -555,7 +558,7 @@ def build_overview(): row1_stats = [ ( 1, - "Workers ready", + "Workers Ready", f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', WORKER_SUFFIX, WORKER_TOTAL, @@ -563,7 +566,7 @@ def build_overview(): ), ( 2, - "Control plane ready", + "Control Plane Ready", f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})', CONTROL_SUFFIX, CONTROL_TOTAL, @@ -571,7 +574,7 @@ def build_overview(): ), ( 3, - "Control plane workloads", + "Control Plane Workloads", CONTROL_WORKLOADS_EXPR, None, 4, @@ -579,7 +582,7 @@ def build_overview(): ), ( 4, - "Problem pods", + "Problem Pods", PROBLEM_PODS_EXPR, None, 1, @@ -587,7 +590,7 @@ def build_overview(): 
), ( 5, - "Stuck terminating", + "Stuck Terminating", STUCK_TERMINATING_EXPR, None, 1, @@ -644,7 +647,7 @@ def build_overview(): ], } width, x = gauge_grid(idx) - if panel_id == 3: + if panel_id in (3, 4, 5): panels.append( stat_panel( panel_id, @@ -654,6 +657,7 @@ def build_overview(): thresholds=thresholds, legend=None, links=links, + text_mode="value", ) ) else: @@ -693,10 +697,10 @@ def build_overview(): ) storage_panels = [ - (23, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"), - (24, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"), - (25, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"), - (26, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"), + (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), + (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"), + (25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"), + (26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"), ] for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): panels.append( @@ -714,7 +718,7 @@ def build_overview(): panels.append( pie_panel( 11, - "Namespace CPU share", + "Namespace CPU Share", namespace_cpu_share_expr(), {"h": 9, "w": 8, "x": 0, "y": 16}, ) @@ -722,7 +726,7 @@ def build_overview(): panels.append( pie_panel( 12, - "Namespace GPU share", + "Namespace GPU Share", namespace_gpu_share_expr(), {"h": 9, "w": 8, "x": 8, "y": 16}, ) @@ -730,7 +734,7 @@ def build_overview(): panels.append( pie_panel( 13, - "Namespace RAM share", + "Namespace RAM Share", namespace_ram_share_expr(), {"h": 9, "w": 8, "x": 16, "y": 16}, ) @@ -740,7 +744,7 @@ def build_overview(): panels.append( timeseries_panel( 14, - "Worker node CPU", + "Worker Node CPU", node_cpu_expr(worker_filter), {"h": 8, "w": 12, "x": 0, "y": 32}, unit="percent", @@ -754,7 +758,7 @@ def build_overview(): panels.append( timeseries_panel( 15, - "Worker node RAM", + "Worker Node RAM", 
node_mem_expr(worker_filter), {"h": 8, "w": 12, "x": 12, "y": 32}, unit="percent", @@ -794,7 +798,7 @@ def build_overview(): panels.append( timeseries_panel( 18, - "Cluster ingress throughput", + "Cluster Ingress Throughput", NET_INGRESS_EXPR, {"h": 7, "w": 8, "x": 0, "y": 25}, unit="Bps", @@ -807,7 +811,7 @@ def build_overview(): panels.append( timeseries_panel( 19, - "Cluster egress throughput", + "Cluster Egress Throughput", NET_EGRESS_EXPR, {"h": 7, "w": 8, "x": 8, "y": 25}, unit="Bps", @@ -820,7 +824,7 @@ def build_overview(): panels.append( timeseries_panel( 20, - "Intra-cluster throughput", + "Intra-Cluster Throughput", NET_INTERNAL_EXPR, {"h": 7, "w": 8, "x": 16, "y": 25}, unit="Bps", @@ -834,7 +838,7 @@ def build_overview(): panels.append( timeseries_panel( 21, - "Root filesystem usage", + "Root Filesystem Usage", root_usage_expr(), {"h": 8, "w": 12, "x": 0, "y": 47}, unit="percent", @@ -849,7 +853,7 @@ def build_overview(): panels.append( bargauge_panel( 22, - "Nodes closest to full root disks", + "Nodes Closest to Full Root Disks", f"topk(8, {root_usage_expr()})", {"h": 8, "w": 12, "x": 12, "y": 47}, unit="percent", @@ -868,7 +872,8 @@ def build_overview(): "style": "dark", "tags": ["atlas", "overview"], "templating": {"list": []}, - "time": {"from": "now-12h", "to": "now"}, + "time": {"from": "now-1h", "to": "now"}, + "refresh": "1m", "links": [ {"title": "Atlas Pods", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False}, {"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False}, @@ -884,7 +889,7 @@ def build_pods_dashboard(): panels.append( stat_panel( 1, - "Problem pods", + "Problem Pods", PROBLEM_PODS_EXPR, {"h": 4, "w": 6, "x": 0, "y": 0}, thresholds={ @@ -914,7 +919,7 @@ def build_pods_dashboard(): panels.append( stat_panel( 3, - "Stuck terminating (>10m)", + "Stuck Terminating (>10m)", STUCK_TERMINATING_EXPR, {"h": 4, "w": 6, "x": 12, "y": 0}, thresholds={ @@ -929,7 +934,7 @@ def 
build_pods_dashboard(): panels.append( stat_panel( 4, - "Control plane workloads", + "Control Plane Workloads", f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})', {"h": 4, "w": 6, "x": 18, "y": 0}, thresholds={ @@ -945,7 +950,7 @@ def build_pods_dashboard(): panels.append( table_panel( 5, - "Pods not running", + "Pods Not Running", PROBLEM_TABLE_EXPR, {"h": 10, "w": 24, "x": 0, "y": 4}, unit="s", @@ -994,7 +999,7 @@ def build_nodes_dashboard(): panels.append( stat_panel( 1, - "Worker nodes ready", + "Worker Nodes Ready", f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', {"h": 4, "w": 8, "x": 0, "y": 0}, value_suffix=WORKER_SUFFIX, @@ -1003,7 +1008,7 @@ def build_nodes_dashboard(): panels.append( stat_panel( 2, - "Control plane ready", + "Control Plane Ready", f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})', {"h": 4, "w": 8, "x": 8, "y": 0}, value_suffix=CONTROL_SUFFIX, @@ -1012,7 +1017,7 @@ def build_nodes_dashboard(): panels.append( stat_panel( 3, - "Control plane workloads", + "Control Plane Workloads", f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})', {"h": 4, "w": 8, "x": 16, "y": 0}, ) @@ -1046,7 +1051,7 @@ def build_nodes_dashboard(): panels.append( timeseries_panel( 6, - "Control plane (incl. titan-db) CPU", + "Control Plane (incl. titan-db) CPU", node_cpu_expr(CONTROL_ALL_REGEX), {"h": 9, "w": 12, "x": 0, "y": 22}, unit="percent", @@ -1058,7 +1063,7 @@ def build_nodes_dashboard(): panels.append( timeseries_panel( 7, - "Control plane (incl. titan-db) RAM", + "Control Plane (incl. 
titan-db) RAM", node_mem_expr(CONTROL_ALL_REGEX), {"h": 9, "w": 12, "x": 12, "y": 22}, unit="percent", @@ -1070,7 +1075,7 @@ def build_nodes_dashboard(): panels.append( timeseries_panel( 8, - "Root filesystem usage", + "Root Filesystem Usage", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 31}, unit="percent", @@ -1099,7 +1104,7 @@ def build_storage_dashboard(): panels.append( stat_panel( 1, - "Astreae usage", + "Astreae Usage", astreae_usage_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 0, "y": 0}, unit="percent", @@ -1109,7 +1114,7 @@ def build_storage_dashboard(): panels.append( stat_panel( 2, - "Asteria usage", + "Asteria Usage", astreae_usage_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 6, "y": 0}, unit="percent", @@ -1119,7 +1124,7 @@ def build_storage_dashboard(): panels.append( stat_panel( 3, - "Astreae free", + "Astreae Free", astreae_free_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="decbytes", @@ -1128,7 +1133,7 @@ def build_storage_dashboard(): panels.append( stat_panel( 4, - "Asteria free", + "Asteria Free", astreae_free_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="decbytes", @@ -1137,7 +1142,7 @@ def build_storage_dashboard(): panels.append( timeseries_panel( 5, - "Astreae per-node usage", + "Astreae Per-Node Usage", filesystem_usage_expr("/mnt/astreae", LONGHORN_NODE_REGEX), {"h": 9, "w": 12, "x": 0, "y": 5}, unit="percent", @@ -1150,7 +1155,7 @@ def build_storage_dashboard(): panels.append( timeseries_panel( 6, - "Asteria per-node usage", + "Asteria Per-Node Usage", filesystem_usage_expr("/mnt/asteria", LONGHORN_NODE_REGEX), {"h": 9, "w": 12, "x": 12, "y": 5}, unit="percent", @@ -1163,7 +1168,7 @@ def build_storage_dashboard(): panels.append( timeseries_panel( 7, - "Astreae usage history", + "Astreae Usage History", astreae_usage_expr("/mnt/astreae"), {"h": 9, "w": 12, "x": 0, "y": 14}, unit="percent", @@ -1173,7 +1178,7 @@ def build_storage_dashboard(): panels.append( timeseries_panel( 8, - "Asteria usage 
history", + "Asteria Usage History", astreae_usage_expr("/mnt/asteria"), {"h": 9, "w": 12, "x": 12, "y": 14}, unit="percent", @@ -1199,7 +1204,7 @@ def build_network_dashboard(): panels.append( stat_panel( 1, - "Ingress traffic", + "Ingress Traffic", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="Bps", @@ -1208,7 +1213,7 @@ def build_network_dashboard(): panels.append( stat_panel( 2, - "Egress traffic", + "Egress Traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps", @@ -1217,7 +1222,7 @@ def build_network_dashboard(): panels.append( stat_panel( 3, - "Intra-cluster traffic", + "Intra-Cluster Traffic", NET_INTERNAL_EXPR, {"h": 4, "w": 8, "x": 16, "y": 0}, unit="Bps", @@ -1226,7 +1231,7 @@ def build_network_dashboard(): panels.append( stat_panel( 4, - "Top router req/s", + "Top Router req/s", f"topk(1, {TRAEFIK_ROUTER_EXPR})", {"h": 4, "w": 8, "x": 0, "y": 4}, unit="req/s", @@ -1236,7 +1241,7 @@ def build_network_dashboard(): panels.append( timeseries_panel( 5, - "Per-node throughput", + "Per-Node Throughput", f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})', {"h": 8, "w": 24, "x": 0, "y": 8}, unit="Bps", @@ -1248,7 +1253,7 @@ def build_network_dashboard(): panels.append( table_panel( 6, - "Top namespaces", + "Top Namespaces", 'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))', {"h": 9, "w": 12, "x": 0, "y": 16}, @@ -1259,7 +1264,7 @@ def build_network_dashboard(): panels.append( table_panel( 7, - "Top pods", + "Top Pods", 'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))', {"h": 9, "w": 12, "x": 12, "y": 16}, @@ -1270,7 +1275,7 @@ def build_network_dashboard(): panels.append( timeseries_panel( 8, - "Traefik routers (req/s)", + "Traefik Routers (req/s)", f"topk(10, 
{TRAEFIK_ROUTER_EXPR})", {"h": 9, "w": 12, "x": 0, "y": 25}, unit="req/s", @@ -1282,7 +1287,7 @@ def build_network_dashboard(): panels.append( timeseries_panel( 9, - "Traefik entrypoints (req/s)", + "Traefik Entrypoints (req/s)", 'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))', {"h": 9, "w": 12, "x": 12, "y": 25}, unit="req/s", @@ -1310,7 +1315,7 @@ def build_gpu_dashboard(): panels.append( pie_panel( 1, - "Namespace GPU share", + "Namespace GPU Share", namespace_gpu_share_expr(), {"h": 8, "w": 12, "x": 0, "y": 0}, ) @@ -1318,8 +1323,8 @@ def build_gpu_dashboard(): panels.append( timeseries_panel( 2, - "GPU util by namespace", - NAMESPACE_GPU_USAGE, + "GPU Util by Namespace", + NAMESPACE_GPU_USAGE_INSTANT, {"h": 8, "w": 12, "x": 12, "y": 0}, unit="percent", legend="{{namespace}}", @@ -1330,7 +1335,7 @@ def build_gpu_dashboard(): panels.append( timeseries_panel( 3, - "GPU util by node", + "GPU Util by Node", 'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})', {"h": 8, "w": 12, "x": 0, "y": 8}, unit="percent", @@ -1342,7 +1347,7 @@ def build_gpu_dashboard(): panels.append( table_panel( 4, - "Top pods by GPU util", + "Top Pods by GPU Util", 'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))', {"h": 8, "w": 12, "x": 12, "y": 8}, unit="percent", diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index da235a5..8c1367b 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -7,7 +7,7 @@ { "id": 1, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace GPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( 
(topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or 
kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -60,7 +60,7 @@ { "id": 2, "type": "timeseries", - "title": "GPU util by namespace", + "title": "GPU Util by Namespace", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -97,7 +97,7 @@ { "id": 3, "type": "timeseries", - "title": "GPU util by node", + "title": "GPU Util by Node", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -134,7 +134,7 @@ { "id": 4, "type": "table", - "title": "Top pods by GPU util", + "title": "Top Pods by GPU Util", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index f2291b7..ff0af9b 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -7,7 +7,7 @@ { "id": 1, "type": "stat", - "title": "Ingress traffic", + "title": 
"Ingress Traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -67,7 +67,7 @@ { "id": 2, "type": "stat", - "title": "Egress traffic", + "title": "Egress Traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -127,7 +127,7 @@ { "id": 3, "type": "stat", - "title": "Intra-cluster traffic", + "title": "Intra-Cluster Traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -187,7 +187,7 @@ { "id": 4, "type": "stat", - "title": "Top router req/s", + "title": "Top Router req/s", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -248,7 +248,7 @@ { "id": 5, "type": "timeseries", - "title": "Per-node throughput", + "title": "Per-Node Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -285,7 +285,7 @@ { "id": 6, "type": "table", - "title": "Top namespaces", + "title": "Top Namespaces", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -321,7 +321,7 @@ { "id": 7, "type": "table", - "title": "Top pods", + "title": "Top Pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -357,7 +357,7 @@ { "id": 8, "type": "timeseries", - "title": "Traefik routers (req/s)", + "title": "Traefik Routers (req/s)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -394,7 +394,7 @@ { "id": 9, "type": "timeseries", - "title": "Traefik entrypoints (req/s)", + "title": "Traefik Entrypoints (req/s)", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index 3cf784f..802fe5a 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -7,7 +7,7 @@ { "id": 1, "type": "stat", - "title": "Worker nodes ready", + "title": "Worker Nodes Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -68,7 +68,7 @@ { "id": 2, "type": "stat", - "title": "Control plane ready", + "title": "Control Plane Ready", "datasource": { "type": 
"prometheus", "uid": "atlas-vm" @@ -129,7 +129,7 @@ { "id": 3, "type": "stat", - "title": "Control plane workloads", + "title": "Control Plane Workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -269,7 +269,7 @@ { "id": 6, "type": "timeseries", - "title": "Control plane (incl. titan-db) CPU", + "title": "Control Plane (incl. titan-db) CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -306,7 +306,7 @@ { "id": 7, "type": "timeseries", - "title": "Control plane (incl. titan-db) RAM", + "title": "Control Plane (incl. titan-db) RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -343,7 +343,7 @@ { "id": 8, "type": "timeseries", - "title": "Root filesystem usage", + "title": "Root Filesystem Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 4e3c357..b556594 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1,7 +1,7 @@ { "uid": "atlas-overview", "title": "Atlas Overview", - "folderUid": "atlas-overview", + "folderUid": "overview", "editable": false, "annotations": { "list": [] @@ -10,7 +10,7 @@ { "id": 1, "type": "gauge", - "title": "Workers ready", + "title": "Workers Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -71,7 +71,7 @@ { "id": 2, "type": "gauge", - "title": "Control plane ready", + "title": "Control Plane Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -124,7 +124,7 @@ { "id": 3, "type": "stat", - "title": "Control plane workloads", + "title": "Control Plane Workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -198,8 +198,8 @@ }, { "id": 4, - "type": "gauge", - "title": "Problem pods", + "type": "stat", + "title": "Problem Pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -218,8 +218,10 @@ ], "fieldConfig": { "defaults": { - "min": 0, 
- "max": 4, + "color": { + "mode": "palette-classic" + }, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ @@ -240,11 +242,18 @@ "value": 3 } ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" } }, "overrides": [] }, "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -252,9 +261,7 @@ "fields": "", "values": false }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false + "textMode": "value" }, "links": [ { @@ -266,8 +273,8 @@ }, { "id": 5, - "type": "gauge", - "title": "Stuck terminating", + "type": "stat", + "title": "Stuck Terminating", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -286,8 +293,10 @@ ], "fieldConfig": { "defaults": { - "min": 0, - "max": 4, + "color": { + "mode": "palette-classic" + }, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ @@ -308,11 +317,18 @@ "value": 3 } ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" } }, "overrides": [] }, "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -320,9 +336,7 @@ "fields": "", "values": false }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false + "textMode": "value" }, "links": [ { @@ -619,7 +633,7 @@ { "id": 23, "type": "stat", - "title": "Astreae usage", + "title": "Astreae Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -690,7 +704,7 @@ { "id": 24, "type": "stat", - "title": "Asteria usage", + "title": "Asteria Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -761,7 +775,7 @@ { "id": 25, "type": "stat", - "title": "Astreae free", + "title": "Astreae Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -828,7 +842,7 @@ { "id": 26, "type": "stat", - "title": "Asteria free", + "title": "Asteria Free", "datasource": { "type": "prometheus", "uid": 
"atlas-vm" @@ -895,7 +909,7 @@ { "id": 11, "type": "piechart", - "title": "Namespace CPU share", + "title": "Namespace CPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -948,7 +962,7 @@ { "id": 12, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace GPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -961,7 +975,7 @@ }, "targets": [ { - "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by 
(namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1001,7 +1015,7 @@ { "id": 13, "type": "piechart", - "title": "Namespace RAM share", + "title": "Namespace RAM Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1054,7 +1068,7 @@ { "id": 14, "type": "timeseries", - 
"title": "Worker node CPU", + "title": "Worker Node CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1101,7 +1115,7 @@ { "id": 15, "type": "timeseries", - "title": "Worker node RAM", + "title": "Worker Node RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1222,7 +1236,7 @@ { "id": 18, "type": "timeseries", - "title": "Cluster ingress throughput", + "title": "Cluster Ingress Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1266,7 +1280,7 @@ { "id": 19, "type": "timeseries", - "title": "Cluster egress throughput", + "title": "Cluster Egress Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1310,7 +1324,7 @@ { "id": 20, "type": "timeseries", - "title": "Intra-cluster throughput", + "title": "Intra-Cluster Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1354,7 +1368,7 @@ { "id": 21, "type": "timeseries", - "title": "Root filesystem usage", + "title": "Root Filesystem Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1402,7 +1416,7 @@ { "id": 22, "type": "bargauge", - "title": "Nodes closest to full root disks", + "title": "Nodes Closest to Full Root Disks", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1456,7 +1470,7 @@ "calcs": [ "lastNotNull" ], - "fields": "/.*/", + "fields": "Value", "values": false } }, @@ -1479,9 +1493,10 @@ "list": [] }, "time": { - "from": "now-12h", + "from": "now-1h", "to": "now" }, + "refresh": "1m", "links": [ { "title": "Atlas Pods", diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index f519d14..ef616e0 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -7,7 +7,7 @@ { "id": 1, "type": "stat", - "title": "Problem pods", + "title": "Problem Pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -127,7 +127,7 @@ { "id": 3, "type": "stat", - "title": "Stuck terminating 
(>10m)", + "title": "Stuck Terminating (>10m)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -187,7 +187,7 @@ { "id": 4, "type": "stat", - "title": "Control plane workloads", + "title": "Control Plane Workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -247,7 +247,7 @@ { "id": 5, "type": "table", - "title": "Pods not running", + "title": "Pods Not Running", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/dashboards/atlas-storage.json b/services/monitoring/dashboards/atlas-storage.json index 6585794..1d07040 100644 --- a/services/monitoring/dashboards/atlas-storage.json +++ b/services/monitoring/dashboards/atlas-storage.json @@ -7,7 +7,7 @@ { "id": 1, "type": "stat", - "title": "Astreae usage", + "title": "Astreae Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -71,7 +71,7 @@ { "id": 2, "type": "stat", - "title": "Asteria usage", + "title": "Asteria Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -135,7 +135,7 @@ { "id": 3, "type": "stat", - "title": "Astreae free", + "title": "Astreae Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -195,7 +195,7 @@ { "id": 4, "type": "stat", - "title": "Asteria free", + "title": "Asteria Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -255,7 +255,7 @@ { "id": 5, "type": "timeseries", - "title": "Astreae per-node usage", + "title": "Astreae Per-Node Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -293,7 +293,7 @@ { "id": 6, "type": "timeseries", - "title": "Asteria per-node usage", + "title": "Asteria Per-Node Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -331,7 +331,7 @@ { "id": 7, "type": "timeseries", - "title": "Astreae usage history", + "title": "Astreae Usage History", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -368,7 +368,7 @@ { "id": 8, "type": "timeseries", - "title": "Asteria usage history", + "title": "Asteria Usage 
History", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 13262d6..1a86c73 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -16,7 +16,7 @@ data: { "id": 1, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace GPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by 
(namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -69,7 +69,7 @@ data: { "id": 2, "type": "timeseries", - "title": "GPU util by namespace", + "title": "GPU Util 
by Namespace", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -106,7 +106,7 @@ data: { "id": 3, "type": "timeseries", - "title": "GPU util by node", + "title": "GPU Util by Node", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -143,7 +143,7 @@ data: { "id": 4, "type": "table", - "title": "Top pods by GPU util", + "title": "Top Pods by GPU Util", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index 4b78fb9..fd1f5d6 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -16,7 +16,7 @@ data: { "id": 1, "type": "stat", - "title": "Ingress traffic", + "title": "Ingress Traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -76,7 +76,7 @@ data: { "id": 2, "type": "stat", - "title": "Egress traffic", + "title": "Egress Traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -136,7 +136,7 @@ data: { "id": 3, "type": "stat", - "title": "Intra-cluster traffic", + "title": "Intra-Cluster Traffic", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -196,7 +196,7 @@ data: { "id": 4, "type": "stat", - "title": "Top router req/s", + "title": "Top Router req/s", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -257,7 +257,7 @@ data: { "id": 5, "type": "timeseries", - "title": "Per-node throughput", + "title": "Per-Node Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -294,7 +294,7 @@ data: { "id": 6, "type": "table", - "title": "Top namespaces", + "title": "Top Namespaces", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -330,7 +330,7 @@ data: { "id": 7, "type": "table", - "title": "Top pods", + "title": "Top Pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -366,7 +366,7 @@ data: { "id": 8, "type": "timeseries", - "title": "Traefik routers (req/s)", + "title": "Traefik 
Routers (req/s)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -403,7 +403,7 @@ data: { "id": 9, "type": "timeseries", - "title": "Traefik entrypoints (req/s)", + "title": "Traefik Entrypoints (req/s)", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index c78e994..2facfed 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -16,7 +16,7 @@ data: { "id": 1, "type": "stat", - "title": "Worker nodes ready", + "title": "Worker Nodes Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -77,7 +77,7 @@ data: { "id": 2, "type": "stat", - "title": "Control plane ready", + "title": "Control Plane Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -138,7 +138,7 @@ data: { "id": 3, "type": "stat", - "title": "Control plane workloads", + "title": "Control Plane Workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -278,7 +278,7 @@ data: { "id": 6, "type": "timeseries", - "title": "Control plane (incl. titan-db) CPU", + "title": "Control Plane (incl. titan-db) CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -315,7 +315,7 @@ data: { "id": 7, "type": "timeseries", - "title": "Control plane (incl. titan-db) RAM", + "title": "Control Plane (incl. 
titan-db) RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -352,7 +352,7 @@ data: { "id": 8, "type": "timeseries", - "title": "Root filesystem usage", + "title": "Root Filesystem Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 512adf9..6fbf7c9 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -10,7 +10,7 @@ data: { "uid": "atlas-overview", "title": "Atlas Overview", - "folderUid": "atlas-overview", + "folderUid": "overview", "editable": false, "annotations": { "list": [] @@ -19,7 +19,7 @@ data: { "id": 1, "type": "gauge", - "title": "Workers ready", + "title": "Workers Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -80,7 +80,7 @@ data: { "id": 2, "type": "gauge", - "title": "Control plane ready", + "title": "Control Plane Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -133,7 +133,7 @@ data: { "id": 3, "type": "stat", - "title": "Control plane workloads", + "title": "Control Plane Workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -207,8 +207,8 @@ data: }, { "id": 4, - "type": "gauge", - "title": "Problem pods", + "type": "stat", + "title": "Problem Pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -227,8 +227,10 @@ data: ], "fieldConfig": { "defaults": { - "min": 0, - "max": 4, + "color": { + "mode": "palette-classic" + }, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ @@ -249,11 +251,18 @@ data: "value": 3 } ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" } }, "overrides": [] }, "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -261,9 +270,7 @@ data: "fields": "", "values": false }, - "orientation": "auto", - "showThresholdMarkers": false, - 
"showThresholdLabels": false + "textMode": "value" }, "links": [ { @@ -275,8 +282,8 @@ data: }, { "id": 5, - "type": "gauge", - "title": "Stuck terminating", + "type": "stat", + "title": "Stuck Terminating", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -295,8 +302,10 @@ data: ], "fieldConfig": { "defaults": { - "min": 0, - "max": 4, + "color": { + "mode": "palette-classic" + }, + "mappings": [], "thresholds": { "mode": "absolute", "steps": [ @@ -317,11 +326,18 @@ data: "value": 3 } ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" } }, "overrides": [] }, "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -329,9 +345,7 @@ data: "fields": "", "values": false }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false + "textMode": "value" }, "links": [ { @@ -628,7 +642,7 @@ data: { "id": 23, "type": "stat", - "title": "Astreae usage", + "title": "Astreae Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -699,7 +713,7 @@ data: { "id": 24, "type": "stat", - "title": "Asteria usage", + "title": "Asteria Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -770,7 +784,7 @@ data: { "id": 25, "type": "stat", - "title": "Astreae free", + "title": "Astreae Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -837,7 +851,7 @@ data: { "id": 26, "type": "stat", - "title": "Asteria free", + "title": "Asteria Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -904,7 +918,7 @@ data: { "id": 11, "type": "piechart", - "title": "Namespace CPU share", + "title": "Namespace CPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -957,7 +971,7 @@ data: { "id": 12, "type": "piechart", - "title": "Namespace GPU share", + "title": "Namespace GPU Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -970,7 +984,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( 
(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1010,7 +1024,7 @@ data: { "id": 13, "type": "piechart", - "title": "Namespace RAM share", + "title": "Namespace RAM Share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1063,7 +1077,7 @@ data: { "id": 14, "type": "timeseries", - "title": "Worker node CPU", + "title": "Worker Node CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1110,7 +1124,7 @@ data: { "id": 15, "type": "timeseries", - "title": "Worker node RAM", + "title": "Worker Node RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1231,7 +1245,7 @@ data: { "id": 18, "type": "timeseries", - "title": "Cluster ingress throughput", + "title": "Cluster 
Ingress Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1275,7 +1289,7 @@ data: { "id": 19, "type": "timeseries", - "title": "Cluster egress throughput", + "title": "Cluster Egress Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1319,7 +1333,7 @@ data: { "id": 20, "type": "timeseries", - "title": "Intra-cluster throughput", + "title": "Intra-Cluster Throughput", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1363,7 +1377,7 @@ data: { "id": 21, "type": "timeseries", - "title": "Root filesystem usage", + "title": "Root Filesystem Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1411,7 +1425,7 @@ data: { "id": 22, "type": "bargauge", - "title": "Nodes closest to full root disks", + "title": "Nodes Closest to Full Root Disks", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1465,7 +1479,7 @@ data: "calcs": [ "lastNotNull" ], - "fields": "/.*/", + "fields": "Value", "values": false } }, @@ -1488,9 +1502,10 @@ data: "list": [] }, "time": { - "from": "now-12h", + "from": "now-1h", "to": "now" }, + "refresh": "1m", "links": [ { "title": "Atlas Pods", diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index 78beca5..f92adf1 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -16,7 +16,7 @@ data: { "id": 1, "type": "stat", - "title": "Problem pods", + "title": "Problem Pods", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -136,7 +136,7 @@ data: { "id": 3, "type": "stat", - "title": "Stuck terminating (>10m)", + "title": "Stuck Terminating (>10m)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -196,7 +196,7 @@ data: { "id": 4, "type": "stat", - "title": "Control plane workloads", + "title": "Control Plane Workloads", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -256,7 +256,7 @@ data: { "id": 5, "type": "table", - "title": 
"Pods not running", + "title": "Pods Not Running", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/grafana-dashboard-storage.yaml b/services/monitoring/grafana-dashboard-storage.yaml index 1bbf1ea..0a534f2 100644 --- a/services/monitoring/grafana-dashboard-storage.yaml +++ b/services/monitoring/grafana-dashboard-storage.yaml @@ -16,7 +16,7 @@ data: { "id": 1, "type": "stat", - "title": "Astreae usage", + "title": "Astreae Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -80,7 +80,7 @@ data: { "id": 2, "type": "stat", - "title": "Asteria usage", + "title": "Asteria Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -144,7 +144,7 @@ data: { "id": 3, "type": "stat", - "title": "Astreae free", + "title": "Astreae Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -204,7 +204,7 @@ data: { "id": 4, "type": "stat", - "title": "Asteria free", + "title": "Asteria Free", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -264,7 +264,7 @@ data: { "id": 5, "type": "timeseries", - "title": "Astreae per-node usage", + "title": "Astreae Per-Node Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -302,7 +302,7 @@ data: { "id": 6, "type": "timeseries", - "title": "Asteria per-node usage", + "title": "Asteria Per-Node Usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -340,7 +340,7 @@ data: { "id": 7, "type": "timeseries", - "title": "Astreae usage history", + "title": "Astreae Usage History", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -377,7 +377,7 @@ data: { "id": 8, "type": "timeseries", - "title": "Asteria usage history", + "title": "Asteria Usage History", "datasource": { "type": "prometheus", "uid": "atlas-vm" diff --git a/services/monitoring/grafana-folders.yaml b/services/monitoring/grafana-folders.yaml index c52b4e1..54b278f 100644 --- a/services/monitoring/grafana-folders.yaml +++ b/services/monitoring/grafana-folders.yaml @@ 
-10,8 +10,8 @@ data: folders.yaml: | apiVersion: 1 folders: - - uid: atlas-overview - title: Atlas Overview + - uid: overview + title: Overview permissions: - role: Viewer permission: View @@ -26,3 +26,10 @@ data: permission: View - role: Admin permission: Admin + - uid: oceanus-internal + title: Oceanus Internal + permissions: + - role: Editor + permission: View + - role: Admin + permission: Admin diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index cf56b27..2546dc1 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -256,6 +256,8 @@ spec: server: domain: metrics.bstein.dev root_url: https://metrics.bstein.dev/ + dashboards: + default_home_dashboard_path: /var/lib/grafana/dashboards/overview/atlas-overview.json auth.anonymous: hide_version: true users: @@ -290,7 +292,7 @@ spec: providers: - name: overview orgId: 1 - folder: Atlas Overview + folder: Overview type: file disableDeletion: false editable: false From 5df94a793736fb474a2026a78d3b6182e9eed1c3 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 2 Dec 2025 14:56:36 -0300 Subject: [PATCH 65/71] monitoring: fix gpu share query and root bar labels --- scripts/dashboards_render_atlas.py | 4 ++-- services/monitoring/dashboards/atlas-gpu.json | 2 +- services/monitoring/dashboards/atlas-overview.json | 4 ++-- services/monitoring/grafana-dashboard-gpu.yaml | 2 +- services/monitoring/grafana-dashboard-overview.yaml | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 11bd2c8..78e759f 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -232,7 +232,7 @@ NAMESPACE_GPU_ALLOC = ( ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)' ) NAMESPACE_GPU_USAGE_SHARE = ( - 'avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)' + 'sum by 
(namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))' ) NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)' NAMESPACE_GPU_RAW = ( @@ -522,7 +522,7 @@ def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None): "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"], - "fields": "Value", + "fields": "", "values": False, }, }, diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 8c1367b..e67b3d2 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) 
(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index b556594..8439407 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -975,7 +975,7 @@ }, "targets": [ { - "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or 
on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1470,7 +1470,7 @@ "calcs": [ "lastNotNull" ], - "fields": "Value", + "fields": "", "values": false } }, diff 
--git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 1a86c73..3af8717 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( ( (sum by (namespace) 
(avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 6fbf7c9..4fcab70 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -984,7 +984,7 @@ data: }, "targets": [ { - 
"expr": "100 * ( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "expr": "100 * ( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( 
sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1479,7 +1479,7 @@ data: "calcs": [ "lastNotNull" ], - "fields": "Value", + "fields": "", "values": false } }, From b93636ecb9b7bed891b6c47fa6b9029b1ab509db Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 2 Dec 2025 15:12:16 -0300 Subject: [PATCH 66/71] monitoring: shrink hottest node row height --- scripts/dashboards_render_atlas.py | 2 +- services/monitoring/dashboards/atlas-overview.json | 8 ++++---- services/monitoring/grafana-dashboard-overview.yaml | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git 
a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 78e759f..dd96c35 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -686,7 +686,7 @@ def build_overview(): panel_id, title, f"{expr}", - {"h": 5, "w": 6, "x": 6 * idx, "y": 5}, + {"h": 3, "w": 6, "x": 6 * idx, "y": 5}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, text_mode="name_and_value", diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 8439407..93a7745 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -355,7 +355,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 0, "y": 5 @@ -428,7 +428,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 6, "y": 5 @@ -501,7 +501,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 12, "y": 5 @@ -570,7 +570,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 18, "y": 5 diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 4fcab70..363e481 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -364,7 +364,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 0, "y": 5 @@ -437,7 +437,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 6, "y": 5 @@ -510,7 +510,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 12, "y": 5 @@ -579,7 +579,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 5, + "h": 3, "w": 6, "x": 18, "y": 5 From ace383bedd1732e9353cfaf0dca5f70cb28d6b5c Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 2 Dec 2025 15:15:21 -0300 Subject: [PATCH 67/71] monitoring: expand worker/control/root rows --- 
scripts/dashboards_render_atlas.py | 12 +++++------ .../monitoring/dashboards/atlas-overview.json | 20 +++++++++---------- .../grafana-dashboard-overview.yaml | 20 +++++++++---------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index dd96c35..8829ca1 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -746,7 +746,7 @@ def build_overview(): 14, "Worker Node CPU", node_cpu_expr(worker_filter), - {"h": 8, "w": 12, "x": 0, "y": 32}, + {"h": 12, "w": 12, "x": 0, "y": 32}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -760,7 +760,7 @@ def build_overview(): 15, "Worker Node RAM", node_mem_expr(worker_filter), - {"h": 8, "w": 12, "x": 12, "y": 32}, + {"h": 12, "w": 12, "x": 12, "y": 32}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -775,7 +775,7 @@ def build_overview(): 16, "Control plane CPU", node_cpu_expr(CONTROL_REGEX), - {"h": 7, "w": 12, "x": 0, "y": 40}, + {"h": 10, "w": 12, "x": 0, "y": 44}, unit="percent", legend="{{node}}", legend_display="table", @@ -787,7 +787,7 @@ def build_overview(): 17, "Control plane RAM", node_mem_expr(CONTROL_REGEX), - {"h": 7, "w": 12, "x": 12, "y": 40}, + {"h": 10, "w": 12, "x": 12, "y": 44}, unit="percent", legend="{{node}}", legend_display="table", @@ -840,7 +840,7 @@ def build_overview(): 21, "Root Filesystem Usage", root_usage_expr(), - {"h": 8, "w": 12, "x": 0, "y": 47}, + {"h": 16, "w": 12, "x": 0, "y": 54}, unit="percent", legend="{{node}}", legend_calcs=["last"], @@ -855,7 +855,7 @@ def build_overview(): 22, "Nodes Closest to Full Root Disks", f"topk(8, {root_usage_expr()})", - {"h": 8, "w": 12, "x": 12, "y": 47}, + {"h": 16, "w": 12, "x": 12, "y": 54}, unit="percent", links=link_to("atlas-storage"), ) diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 93a7745..9800f1c 100644 --- 
a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1074,7 +1074,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 12, "w": 12, "x": 0, "y": 32 @@ -1121,7 +1121,7 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 12, "w": 12, "x": 12, "y": 32 @@ -1168,10 +1168,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 10, "w": 12, "x": 0, - "y": 40 + "y": 44 }, "targets": [ { @@ -1205,10 +1205,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 10, "w": 12, "x": 12, - "y": 40 + "y": 44 }, "targets": [ { @@ -1374,10 +1374,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 16, "w": 12, "x": 0, - "y": 47 + "y": 54 }, "targets": [ { @@ -1422,10 +1422,10 @@ "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 16, "w": 12, "x": 12, - "y": 47 + "y": 54 }, "targets": [ { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 363e481..7b91758 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1083,7 +1083,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 12, "w": 12, "x": 0, "y": 32 @@ -1130,7 +1130,7 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 12, "w": 12, "x": 12, "y": 32 @@ -1177,10 +1177,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 10, "w": 12, "x": 0, - "y": 40 + "y": 44 }, "targets": [ { @@ -1214,10 +1214,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 10, "w": 12, "x": 12, - "y": 40 + "y": 44 }, "targets": [ { @@ -1383,10 +1383,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 16, "w": 12, "x": 0, - "y": 47 + "y": 54 }, "targets": [ { @@ -1431,10 +1431,10 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 8, + "h": 16, "w": 12, "x": 12, - "y": 47 + "y": 54 }, "targets": [ { From 6eba26b359f3989c1fdfef1c02cacc9969cd634d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 2 
Dec 2025 15:21:02 -0300 Subject: [PATCH 68/71] monitoring: show top12 root disks --- scripts/dashboards_render_atlas.py | 2 +- services/monitoring/dashboards/atlas-overview.json | 2 +- services/monitoring/grafana-dashboard-overview.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 8829ca1..93de006 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -854,7 +854,7 @@ def build_overview(): bargauge_panel( 22, "Nodes Closest to Full Root Disks", - f"topk(8, {root_usage_expr()})", + f"topk(12, {root_usage_expr()})", {"h": 16, "w": 12, "x": 12, "y": 54}, unit="percent", links=link_to("atlas-storage"), diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 9800f1c..9eda81d 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1429,7 +1429,7 @@ }, "targets": [ { - "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 7b91758..928098e 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ 
b/services/monitoring/grafana-dashboard-overview.yaml @@ -1438,7 +1438,7 @@ data: }, "targets": [ { - "expr": "topk(8, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", + "expr": "topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", "legendFormat": "{{node}}" } From 839fb94836dec85164c3cf680e00732707ea86bf Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 2 Dec 2025 17:01:32 -0300 Subject: [PATCH 69/71] notes: update monitoring and next steps --- AGENTS.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index 05838aa..d660e75 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -40,3 +40,14 @@ Repository Guidelines - Atlas Overview panels are paired with internal dashboards (pods, nodes, storage, network, GPU). A new `atlas-gpu` internal dashboard holds the detailed GPU metrics that feed the overview share pie. - Old Grafana folders (`Atlas Storage`, `Atlas SRE`, `Atlas Public`, `Atlas Nodes`) should be removed in Grafana UI when convenient; only `Atlas Overview` and `Atlas Internal` should remain provisioned. - Future work: add a separate generator (e.g., `dashboards_render_oceanus.py`) for SUI/oceanus validation dashboards, mirroring the atlas pattern of internal dashboards feeding a public overview. 
+ +## Monitoring state (2025-12-03) +- dcgm-exporter DaemonSet pulls `registry.bstein.dev/monitoring/dcgm-exporter:4.4.2-4.7.0-ubuntu22.04` with nvidia runtime/imagePullSecret; titan-24 exports metrics, titan-22 remains NotReady. +- Atlas Overview is the Grafana home (1h range, 1m refresh), Overview folder UID `overview`, internal folder `atlas-internal` (oceanus-internal stub). +- Panels standardized via generator; hottest row compressed, worker/control rows taller, root disk row taller and top12 bar gauge with labels. GPU share pie uses 1h avg_over_time to persist idle activity. +- Internal dashboards are provisioned without Viewer role; if anonymous still sees them, restart Grafana and tighten auth if needed. + +## Upcoming priorities (SSO/storage/mail) +- Establish SSO (Keycloak or similar) and federate Grafana, Gitea, Zot, Nextcloud, Pegasus/Jellyfin; keep Vaultwarden separate until safe. +- Add Nextcloud (limit to rpi5 workers) with office suite; integrate with SSO; plan storage class and ingress. +- Plan mail: mostly self-hosted, relay through trusted provider for outbound; integrate with services (Nextcloud, Vaultwarden, etc.) for notifications and account flows. From 762aa7bb0f57cabebff4b61f8a1cd09fa495cd9e Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 2 Dec 2025 17:14:45 -0300 Subject: [PATCH 70/71] notes: add sso plan sketch --- AGENTS.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index d660e75..664fc6c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -51,3 +51,14 @@ Repository Guidelines - Establish SSO (Keycloak or similar) and federate Grafana, Gitea, Zot, Nextcloud, Pegasus/Jellyfin; keep Vaultwarden separate until safe. - Add Nextcloud (limit to rpi5 workers) with office suite; integrate with SSO; plan storage class and ingress. - Plan mail: mostly self-hosted, relay through trusted provider for outbound; integrate with services (Nextcloud, Vaultwarden, etc.) for notifications and account flows. 
+ +## SSO plan sketch (2025-12-03) +- IdP: use Keycloak (preferred) in a new `sso` namespace, Bitnami or codecentric chart with Postgres backing store (single PVC), ingress `sso.bstein.dev`, admin user bound to brad@bstein.dev; stick with local DB initially (no external IdP). +- Auth flow goals: Grafana (OIDC), Gitea (OAuth2/Keycloak), Zot (via Traefik forward-auth/oauth2-proxy), Jellyfin/Pegasus via Jellyfin OAuth/OpenID plugin (map existing usernames; run migration to pre-create users in Keycloak with same usernames/emails and temporary passwords), Pegasus keeps using Jellyfin tokens. +- Steps to implement: + 1) Add service folder `services/keycloak/` (namespace, PVC, HelmRelease, ingress, secret for admin creds). Verify with kustomize + Flux reconcile. + 2) Seed realm `atlas` with users (import CSV/realm). Create clients for Grafana (public/implicit) and Gitea (confidential), plus a “jellyfin” client for the OAuth plugin; set brad@bstein.dev as the admin user's email. + 3) Reconfigure Grafana to OIDC (disable anonymous to internal folders, leave Overview public via folder permissions). Reconfigure Gitea to OIDC (app.ini). + 4) Add Traefik forward-auth (oauth2-proxy) in front of Zot and any other services needing headers-based auth. + 5) Deploy Jellyfin OpenID plugin; map Keycloak users to existing Jellyfin usernames; communicate password reset path. +- Migration caution: do not delete existing local creds until SSO validated; keep Pegasus working via Jellyfin tokens during transition. From e80505a7730ea1847011a9e9130445b53721f01d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 2 Dec 2025 17:36:37 -0300 Subject: [PATCH 71/71] notes: add postgres centralization guidance --- AGENTS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index 664fc6c..a8d49c8 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -62,3 +62,7 @@ Repository Guidelines 4) Add Traefik forward-auth (oauth2-proxy) in front of Zot and any other services needing headers-based auth.
5) Deploy Jellyfin OpenID plugin; map Keycloak users to existing Jellyfin usernames; communicate password reset path. - Migration caution: do not delete existing local creds until SSO validated; keep Pegasus working via Jellyfin tokens during transition. + +## Postgres centralization (2025-12-03) +- Prefer a shared in-cluster Postgres deployment with per-service databases to reduce resource sprawl on Pi nodes. Use it for services that can easily point at an external DB. +- Candidates to migrate to shared Postgres: Keycloak (realm DB), Gitea (git DB), Nextcloud (app DB), possibly Grafana (if persistence needed beyond current provisioner), Jitsi prosody/JVB state (if external DB supported). Keep tightly-coupled or lightweight embedded DBs as-is when migration is painful or not supported.