From eed67b3db01c493bce6ba5aa6ef8e70d1ec4a041 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Tue, 2 Dec 2025 13:16:00 -0300 Subject: [PATCH] monitoring: regen dashboards with gpu details --- AGENTS.md | 42 ++++ ...shboards.py => dashboards_render_atlas.py} | 177 ++++++++++++---- services/monitoring/dashboards/atlas-gpu.json | 184 +++++++++++++++++ .../monitoring/dashboards/atlas-network.json | 5 +- .../monitoring/dashboards/atlas-overview.json | 14 +- .../monitoring/grafana-dashboard-gpu.yaml | 193 ++++++++++++++++++ .../monitoring/grafana-dashboard-network.yaml | 5 +- .../grafana-dashboard-overview.yaml | 14 +- services/monitoring/helmrelease.yaml | 9 + services/monitoring/kustomization.yaml | 1 + 10 files changed, 584 insertions(+), 60 deletions(-) create mode 100644 AGENTS.md rename scripts/{render_dashboards.py => dashboards_render_atlas.py} (90%) create mode 100644 services/monitoring/dashboards/atlas-gpu.json create mode 100644 services/monitoring/grafana-dashboard-gpu.yaml diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..05838aa --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,42 @@ + + +Repository Guidelines + +## Project Structure & Module Organization +- `infrastructure/`: cluster-scoped building blocks (core, flux-system, traefik, longhorn). Add new platform features by mirroring this layout. +- `services/`: workload manifests per app (`services/gitea/`, etc.) with `kustomization.yaml` plus one file per kind; keep diffs small and focused. +- `dockerfiles/` hosts bespoke images, while `scripts/` stores operational Fish/Bash helpers—extend these directories instead of relying on ad-hoc commands. + +## Build, Test, and Development Commands +- `kustomize build services/<app>` (or `kubectl kustomize ...`) renders manifests exactly as Flux will. +- `kubectl apply --dry-run=client -k services/<app>` checks schema compatibility without touching the cluster. 
+- `flux reconcile kustomization <name> --namespace flux-system --with-source` pulls the latest Git state after merges or hotfixes. +- `fish scripts/flux_hammer.fish --help` explains the recovery tool; read it before running against production workloads. + +## Coding Style & Naming Conventions +- YAML uses two-space indents; retain the leading path comment (e.g. `# services/gitea/deployment.yaml`) to speed code review. +- Keep resource names lowercase kebab-case, align labels/selectors, and mirror namespaces with directory names. +- List resources in `kustomization.yaml` from namespace/config, through storage, then workloads and networking for predictable diffs. +- Scripts start with `#!/usr/bin/env fish` or bash, stay executable, and follow snake_case names such as `flux_hammer.fish`. + +## Testing Guidelines +- Run `kustomize build` and the dry-run apply for every service you touch; capture failures before opening a PR. +- `flux diff kustomization <name> --path services/<app>` previews reconciliations—link notable output when behavior shifts. +- Docker edits: `docker build -f dockerfiles/Dockerfile.monerod .` (swap the file you changed) to verify image builds. + +## Commit & Pull Request Guidelines +- Keep commit subjects short, present-tense, and optionally scoped (`gpu(titan-24): add RuntimeClass`); squash fixups before review. +- Describe linked issues, affected services, and required operator steps (e.g. `flux reconcile kustomization services-gitea`) in the PR body. +- Focus each PR on one kustomization or service and update `infrastructure/flux-system` when Flux must track new folders. +- Record the validation you ran (dry-runs, diffs, builds) and add screenshots only when ingress or UI behavior changes. + +## Security & Configuration Tips +- Never commit credentials; use Vault workflows (`services/vault/`) or SOPS-encrypted manifests wired through `infrastructure/flux-system`. 
+- Node selectors and tolerations gate workloads to hardware like `hardware: rpi4`; confirm labels before scaling or renaming nodes. +- Pin external images by digest or rely on Flux image automation to follow approved tags and avoid drift. + +## Dashboard roadmap / context (2025-12-02) +- Atlas dashboards are generated via `scripts/dashboards_render_atlas.py --build`, which writes JSON under `services/monitoring/dashboards/` and ConfigMaps under `services/monitoring/`. Keep the Grafana manifests in sync by regenerating after edits. +- Atlas Overview panels are paired with internal dashboards (pods, nodes, storage, network, GPU). A new `atlas-gpu` internal dashboard holds the detailed GPU metrics that feed the overview share pie. +- Old Grafana folders (`Atlas Storage`, `Atlas SRE`, `Atlas Public`, `Atlas Nodes`) should be removed in Grafana UI when convenient; only `Atlas Overview` and `Atlas Internal` should remain provisioned. +- Future work: add a separate generator (e.g., `dashboards_render_oceanus.py`) for SUI/oceanus validation dashboards, mirroring the atlas pattern of internal dashboards feeding a public overview. diff --git a/scripts/render_dashboards.py b/scripts/dashboards_render_atlas.py similarity index 90% rename from scripts/render_dashboards.py rename to scripts/dashboards_render_atlas.py index 812a931..97070d2 100644 --- a/scripts/render_dashboards.py +++ b/scripts/dashboards_render_atlas.py @@ -2,8 +2,8 @@ """Generate Atlas Grafana dashboards and render them into ConfigMaps. 
Usage: - scripts/render_dashboards.py --build # rebuild JSON + ConfigMaps - scripts/render_dashboards.py # re-render ConfigMaps from JSON + scripts/dashboards_render_atlas.py --build # rebuild JSON + ConfigMaps + scripts/dashboards_render_atlas.py # re-render ConfigMaps from JSON """ import argparse @@ -198,7 +198,6 @@ STUCK_TERMINATING_EXPR = ( ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)' '))' ) - PROBLEM_TABLE_EXPR = ( "(time() - kube_pod_created{pod!=\"\"}) " "* on(namespace,pod) group_left(node) kube_pod_info " @@ -489,6 +488,47 @@ def pie_panel(panel_id, title, expr, grid): } +def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None): + """Return a bar gauge panel with label-aware reduction.""" + panel = { + "id": panel_id, + "type": "bargauge", + "title": title, + "datasource": PROM_DS, + "gridPos": grid, + "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{node}}"}], + "fieldConfig": { + "defaults": { + "unit": unit, + "min": 0, + "max": 100 if unit == "percent" else None, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 50}, + {"color": "orange", "value": 70}, + {"color": "red", "value": 85}, + ], + }, + }, + "overrides": [], + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "/.*/", + "values": False, + }, + }, + } + if links: + panel["links"] = links + return panel + + def text_panel(panel_id, title, content, grid): return { "id": panel_id, @@ -554,6 +594,7 @@ def build_overview(): link_to("atlas-pods"), ), ] + def gauge_grid(idx): width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4 x = sum(GAUGE_WIDTHS[:idx]) @@ -806,38 +847,14 @@ def build_overview(): ) ) panels.append( - { - "id": 22, - "type": "bargauge", - "title": "Nodes closest to full root disks", - "datasource": PROM_DS, - "gridPos": {"h": 8, "w": 12, "x": 12, "y": 47}, - 
"targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}], - "fieldConfig": { - "defaults": { - "unit": "percent", - "min": 0, - "max": 100, - "thresholds": { - "mode": "absolute", - "steps": [ - {"color": "green", "value": None}, - {"color": "yellow", "value": 50}, - {"color": "orange", "value": 70}, - {"color": "red", "value": 85}, - ], - }, - }, - "overrides": [], - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, - }, - "links": link_to("atlas-storage"), - "transformations": [{"id": "labelsToFields", "options": {}}], - } + bargauge_panel( + 22, + "Nodes closest to full root disks", + f"topk(8, {root_usage_expr()})", + {"h": 8, "w": 12, "x": 12, "y": 47}, + unit="percent", + links=link_to("atlas-storage"), + ) ) return { @@ -857,6 +874,7 @@ def build_overview(): {"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False}, {"title": "Atlas Storage", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False}, {"title": "Atlas Network", "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": False}, + {"title": "Atlas GPU", "type": "dashboard", "dashboardUid": "atlas-gpu", "keepTime": False}, ], } @@ -1179,13 +1197,31 @@ def build_storage_dashboard(): def build_network_dashboard(): panels = [] panels.append( - stat_panel(1, "Ingress traffic", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="Bps") + stat_panel( + 1, + "Ingress traffic", + NET_INGRESS_EXPR, + {"h": 4, "w": 8, "x": 0, "y": 0}, + unit="Bps", + ) ) panels.append( - stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps") + stat_panel( + 2, + "Egress traffic", + NET_EGRESS_EXPR, + {"h": 4, "w": 8, "x": 8, "y": 0}, + unit="Bps", + ) ) panels.append( - stat_panel(3, "Intra-cluster traffic", NET_INTERNAL_EXPR, {"h": 4, "w": 8, "x": 16, "y": 0}, unit="Bps") + 
stat_panel( + 3, + "Intra-cluster traffic", + NET_INTERNAL_EXPR, + {"h": 4, "w": 8, "x": 16, "y": 0}, + unit="Bps", + ) ) panels.append( stat_panel( @@ -1195,14 +1231,13 @@ def build_network_dashboard(): {"h": 4, "w": 8, "x": 0, "y": 4}, unit="req/s", legend="{{router}}", - instant=True, ) ) panels.append( timeseries_panel( 5, "Per-node throughput", - node_net_expr(), + f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})', {"h": 8, "w": 24, "x": 0, "y": 8}, unit="Bps", legend="{{node}}", @@ -1270,6 +1305,64 @@ def build_network_dashboard(): } +def build_gpu_dashboard(): + panels = [] + panels.append( + pie_panel( + 1, + "Namespace GPU share", + namespace_gpu_share_expr(), + {"h": 8, "w": 12, "x": 0, "y": 0}, + ) + ) + panels.append( + timeseries_panel( + 2, + "GPU util by namespace", + NAMESPACE_GPU_USAGE, + {"h": 8, "w": 12, "x": 12, "y": 0}, + unit="percent", + legend="{{namespace}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 3, + "GPU util by node", + 'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})', + {"h": 8, "w": 12, "x": 0, "y": 8}, + unit="percent", + legend="{{Hostname}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + table_panel( + 4, + "Top pods by GPU util", + 'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))', + {"h": 8, "w": 12, "x": 12, "y": 8}, + unit="percent", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + return { + "uid": "atlas-gpu", + "title": "Atlas GPU", + "folderUid": PRIVATE_FOLDER, + "editable": True, + "panels": panels, + "time": {"from": "now-12h", "to": "now"}, + "annotations": {"list": []}, + "schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "gpu"], + } + + DASHBOARDS = { "atlas-overview": { "builder": build_overview, @@ -1291,6 +1384,10 @@ DASHBOARDS = { "builder": build_network_dashboard, "configmap": ROOT / "services" / 
"monitoring" / "grafana-dashboard-network.yaml", }, + "atlas-gpu": { + "builder": build_gpu_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml", + }, } diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json new file mode 100644 index 0000000..da235a5 --- /dev/null +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -0,0 +1,184 @@ +{ + "uid": "atlas-gpu", + "title": "Atlas GPU", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "piechart", + "title": "Namespace GPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 2, + "type": "timeseries", + "title": "GPU util by namespace", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 3, + "type": "timeseries", + "title": "GPU util by node", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "targets": [ + { + "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})", + "refId": "A", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { 
+ "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 4, + "type": "table", + "title": "Top pods by GPU util", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "targets": [ + { + "expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "gpu" + ] +} diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index 9005eb9..f2291b7 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -202,8 +202,7 @@ { "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", "refId": "A", - "legendFormat": "{{router}}", - "instant": true + "legendFormat": "{{router}}" } ], "fieldConfig": { @@ -262,7 +261,7 @@ }, "targets": [ { - "expr": "avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) 
label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 93a246b..4e3c357 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1456,7 +1456,7 @@ "calcs": [ "lastNotNull" ], - "fields": "", + "fields": "/.*/", "values": false } }, @@ -1466,12 +1466,6 @@ "url": "/d/atlas-storage", "targetBlank": true } - ], - "transformations": [ - { - "id": "labelsToFields", - "options": {} - } ] } ], @@ -1512,6 +1506,12 @@ "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": false + }, + { + "title": "Atlas GPU", + "type": "dashboard", + "dashboardUid": "atlas-gpu", + "keepTime": false } ] } diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml new file mode 100644 index 0000000..13262d6 --- /dev/null +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -0,0 +1,193 @@ +# services/monitoring/grafana-dashboard-gpu.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-gpu + labels: + grafana_dashboard: "1" +data: + atlas-gpu.json: | + { + "uid": "atlas-gpu", + "title": "Atlas GPU", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "piechart", + "title": "Namespace GPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "100 * ( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + 
(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ) / clamp_min(sum( ( (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) ) and on(namespace) ( (topk(10, ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) ) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace) * 0) * 100)) >= bool 0) ) ), 1)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "displayLabels": [ + "percent" + ], + "tooltip": { + "mode": "single" + }, + "colorScheme": "interpolateSpectral", + "colorBy": "value", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 2, + "type": "timeseries", + "title": "GPU util by namespace", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 
0 + }, + "targets": [ + { + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}) by (namespace)", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 3, + "type": "timeseries", + "title": "GPU util by node", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "targets": [ + { + "expr": "sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=\"\"})", + "refId": "A", + "legendFormat": "{{Hostname}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 4, + "type": "table", + "title": "Top pods by GPU util", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "targets": [ + { + "expr": "topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=\"\"}) by (namespace,pod,Hostname))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "gpu" + ] + } diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index d2372de..4b78fb9 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -211,8 +211,7 @@ data: { "expr": "topk(1, sum by (router) 
(rate(traefik_router_requests_total[5m])))", "refId": "A", - "legendFormat": "{{router}}", - "instant": true + "legendFormat": "{{router}}" } ], "fieldConfig": { @@ -271,7 +270,7 @@ data: }, "targets": [ { - "expr": "avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]) + rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "expr": "avg by (node) ((sum(rate(node_network_transmit_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0) + sum(rate(node_network_receive_bytes_total{device!~\"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*\"}[5m])) or on() vector(0)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index ebd9b2b..512adf9 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1465,7 +1465,7 @@ data: "calcs": [ "lastNotNull" ], - "fields": "", + "fields": "/.*/", "values": false } }, @@ -1475,12 +1475,6 @@ data: "url": "/d/atlas-storage", "targetBlank": true } - ], - "transformations": [ - { - "id": "labelsToFields", - "options": {} - } ] } ], @@ -1521,6 +1515,12 @@ data: "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": false + }, + { + "title": "Atlas GPU", + "type": "dashboard", + "dashboardUid": "atlas-gpu", + "keepTime": false } ] } diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 5a8f1ba..cf56b27 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -320,6 +320,14 @@ spec: editable: true options: path: 
/var/lib/grafana/dashboards/storage + - name: gpu + orgId: 1 + folder: Atlas Internal + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/gpu - name: network orgId: 1 folder: Atlas Internal @@ -333,6 +341,7 @@ spec: pods: grafana-dashboard-pods nodes: grafana-dashboard-nodes storage: grafana-dashboard-storage + gpu: grafana-dashboard-gpu network: grafana-dashboard-network extraConfigmapMounts: - name: grafana-folders diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 3164862..a50a1c1 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -10,6 +10,7 @@ resources: - grafana-dashboard-nodes.yaml - grafana-dashboard-storage.yaml - grafana-dashboard-network.yaml + - grafana-dashboard-gpu.yaml - dcgm-exporter.yaml - grafana-folders.yaml - helmrelease.yaml