From 5093f77c0a34f04e2d9946e2b6125fed90cbadf0 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Thu, 1 Jan 2026 14:44:33 -0300 Subject: [PATCH] monitoring: per-panel namespace share filters --- scripts/dashboards_render_atlas.py | 130 +++++++++++---- services/ai-llm/deployment.yaml | 2 + services/communication/element-rendered.yaml | 21 --- services/monitoring/dashboards/atlas-gpu.json | 106 +++++++++++-- .../monitoring/dashboards/atlas-nodes.json | 2 +- .../monitoring/dashboards/atlas-overview.json | 150 ++++++++++++++++-- .../monitoring/dashboards/atlas-pods.json | 2 +- .../monitoring/grafana-dashboard-gpu.yaml | 106 +++++++++++-- .../monitoring/grafana-dashboard-nodes.yaml | 2 +- .../grafana-dashboard-overview.yaml | 150 ++++++++++++++++-- .../monitoring/grafana-dashboard-pods.yaml | 2 +- 11 files changed, 568 insertions(+), 105 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 58da298..34a108a 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -9,6 +9,7 @@ Usage: import argparse import json import textwrap +import urllib.parse from pathlib import Path # --------------------------------------------------------------------------- @@ -80,7 +81,7 @@ CONTROL_TOTAL = len(CONTROL_PLANE_NODES) WORKER_TOTAL = len(WORKER_NODES) CONTROL_SUFFIX = f"/{CONTROL_TOTAL}" WORKER_SUFFIX = f"/{WORKER_TOTAL}" -CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system" +CP_ALLOWED_NS = "(^kube.*|.*-system$|^traefik$|^monitoring$)" LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]" GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4] CONTROL_WORKLOADS_EXPR = ( @@ -170,22 +171,43 @@ def node_io_expr(scope=""): return scoped_node_expr(base, scope) +def namespace_selector(scope_var): + return f'namespace!="",pod!="",container!="",{scope_var}' + + +def namespace_gpu_selector(scope_var): + return f'namespace!="",pod!="",{scope_var}' + + +def namespace_cpu_raw(scope_var): + return f"sum(rate(container_cpu_usage_seconds_total{{{namespace_selector(scope_var)}}}[5m])) by (namespace)" + + +def namespace_ram_raw(scope_var): + return f"sum(container_memory_working_set_bytes{{{namespace_selector(scope_var)}}}) by (namespace)" + + +def namespace_gpu_usage_instant(scope_var): + return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)" + + def namespace_share_expr(resource_expr): total = f"clamp_min(sum( {resource_expr} ), 1)" return f"100 * ( {resource_expr} ) / {total}" -def namespace_cpu_share_expr(): - return namespace_share_expr(NAMESPACE_CPU_RAW) +def namespace_cpu_share_expr(scope_var): + return namespace_share_expr(namespace_cpu_raw(scope_var)) -def namespace_ram_share_expr(): - return namespace_share_expr(NAMESPACE_RAM_RAW) +def namespace_ram_share_expr(scope_var): + return namespace_share_expr(namespace_ram_raw(scope_var)) -def namespace_gpu_share_expr(): - total = f"(sum({NAMESPACE_GPU_USAGE_INSTANT}) or on() vector(0))" - share = f"100 * ({NAMESPACE_GPU_USAGE_INSTANT}) / clamp_min({total}, 1)" +def namespace_gpu_share_expr(scope_var): + usage = namespace_gpu_usage_instant(scope_var) + total = f"(sum({usage}) or on() vector(0))" + share = f"100 * ({usage}) / clamp_min({total}, 1)" idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)" return f"({share}) or ({idle})" @@ -272,20 +294,12 @@ STUCK_TABLE_EXPR = ( ")" ) -NAMESPACE_SCOPE_WORKLOAD = 'namespace!~"(^kube.*|.*-system$|^traefik$)"' +NAMESPACE_SCOPE_WORKLOAD = 'namespace!~"(^kube.*|.*-system$|^traefik$|^monitoring$)"' NAMESPACE_SCOPE_ALL = 'namespace=~".*"' -NAMESPACE_SCOPE_INFRA = 'namespace=~"(^kube.*|.*-system$|^traefik$)"' -NAMESPACE_SCOPE_VAR = "$namespace_scope" -NAMESPACE_SELECTOR = f'namespace!="",pod!="",container!="",{NAMESPACE_SCOPE_VAR}' -NAMESPACE_GPU_SELECTOR = f'namespace!="",pod!="",{NAMESPACE_SCOPE_VAR}' - -NAMESPACE_CPU_RAW = ( - f'sum(rate(container_cpu_usage_seconds_total{{{NAMESPACE_SELECTOR}}}[5m])) by (namespace)' -) -NAMESPACE_RAM_RAW = f'sum(container_memory_working_set_bytes{{{NAMESPACE_SELECTOR}}}) by (namespace)' +NAMESPACE_SCOPE_INFRA = 'namespace=~"(^kube.*|.*-system$|^traefik$|^monitoring$)"' +NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"] GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) -NAMESPACE_GPU_USAGE_INSTANT = f'sum(DCGM_FI_DEV_GPU_UTIL{{{NAMESPACE_GPU_SELECTOR}}}) by (namespace)' TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" TRAEFIK_NET_INGRESS = ( 'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))' @@ -536,9 +550,9 @@ def table_panel( return panel -def pie_panel(panel_id, title, expr, grid): +def pie_panel(panel_id, title, expr, grid, *, links=None, description=None): """Return a pie chart panel with readable namespace labels.""" - return { + panel = { "id": panel_id, "type": "piechart", "title": title, @@ -562,9 +576,14 @@ def pie_panel(panel_id, title, expr, grid): "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, }, } + if links: + panel["links"] = links + if description: + panel["description"] = description + return panel -def namespace_scope_variable(): +def namespace_scope_variable(var_name, label): options = [ { "text": "workload namespaces only", @@ -587,13 +606,13 @@ def namespace_scope_variable(): + NAMESPACE_SCOPE_INFRA ) return { - "name": "namespace_scope", - "label": "Namespace filter", + "name": var_name, + "label": label, "type": "custom", "query": query, "current": {"text": options[0]["text"], "value": options[0]["value"], "selected": True}, "options": options, - "hide": 0, + "hide": 2, "multi": False, "includeAll": False, "refresh": 1, @@ -602,6 +621,28 @@ def namespace_scope_variable(): } +def namespace_scope_links(var_name): + def with_value(value): + encoded = urllib.parse.quote(value, safe="") + params = [] + for other in NAMESPACE_SCOPE_VARS: + if other == var_name: + params.append(f"var-{other}={encoded}") + else: + params.append(f"var-{other}=${{{other}}}") + return "?" + "&".join(params) + + return [ + {"title": "Workload namespaces only", "url": with_value(NAMESPACE_SCOPE_WORKLOAD), "targetBlank": False}, + {"title": "All namespaces", "url": with_value(NAMESPACE_SCOPE_ALL), "targetBlank": False}, + { + "title": "Infrastructure namespaces only", + "url": with_value(NAMESPACE_SCOPE_INFRA), + "targetBlank": False, + }, + ] + + def bargauge_panel( panel_id, title, @@ -890,28 +931,38 @@ def build_overview(): ) ) + cpu_scope = "$namespace_scope_cpu" + gpu_scope = "$namespace_scope_gpu" + ram_scope = "$namespace_scope_ram" + panels.append( pie_panel( 11, "Namespace CPU Share", - namespace_cpu_share_expr(), + namespace_cpu_share_expr(cpu_scope), {"h": 9, "w": 8, "x": 0, "y": 16}, + links=namespace_scope_links("namespace_scope_cpu"), + description="Use panel links to switch namespace scope.", ) ) panels.append( pie_panel( 12, "Namespace GPU Share", - namespace_gpu_share_expr(), + namespace_gpu_share_expr(gpu_scope), {"h": 9, "w": 8, "x": 8, "y": 16}, + links=namespace_scope_links("namespace_scope_gpu"), + description="Use panel links to switch namespace scope.", ) ) panels.append( pie_panel( 13, "Namespace RAM Share", - namespace_ram_share_expr(), + namespace_ram_share_expr(ram_scope), {"h": 9, "w": 8, "x": 16, "y": 16}, + links=namespace_scope_links("namespace_scope_ram"), + description="Use panel links to switch namespace scope.", ) ) @@ -1077,7 +1128,13 @@ def build_overview(): "schemaVersion": 39, "style": "dark", "tags": ["atlas", "overview"], - "templating": {"list": [namespace_scope_variable()]}, + "templating": { + "list": [ + namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"), + namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"), + namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"), + ] + }, "time": {"from": "now-1h", "to": "now"}, "refresh": "1m", "links": [], @@ -1718,19 +1775,22 @@ def build_network_dashboard(): def build_gpu_dashboard(): panels = [] + gpu_scope = "$namespace_scope_gpu" panels.append( pie_panel( 1, "Namespace GPU Share", - namespace_gpu_share_expr(), + namespace_gpu_share_expr(gpu_scope), {"h": 8, "w": 12, "x": 0, "y": 0}, + links=namespace_scope_links("namespace_scope_gpu"), + description="Use panel links to switch namespace scope.", ) ) panels.append( timeseries_panel( 2, "GPU Util by Namespace", - NAMESPACE_GPU_USAGE_INSTANT, + namespace_gpu_usage_instant(gpu_scope), {"h": 8, "w": 12, "x": 12, "y": 0}, unit="percent", legend="{{namespace}}", @@ -1771,7 +1831,13 @@ def build_gpu_dashboard(): "schemaVersion": 39, "style": "dark", "tags": ["atlas", "gpu"], - "templating": {"list": [namespace_scope_variable()]}, + "templating": { + "list": [ + namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"), + namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"), + namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"), + ] + }, } diff --git a/services/ai-llm/deployment.yaml b/services/ai-llm/deployment.yaml index b74dc0a..0bdc275 100644 --- a/services/ai-llm/deployment.yaml +++ b/services/ai-llm/deployment.yaml @@ -7,6 +7,8 @@ metadata: spec: replicas: 1 revisionHistoryLimit: 2 + strategy: + type: Recreate selector: matchLabels: app: ollama diff --git a/services/communication/element-rendered.yaml b/services/communication/element-rendered.yaml index c0b03c1..f04dda2 100644 --- a/services/communication/element-rendered.yaml +++ b/services/communication/element-rendered.yaml @@ -200,24 +200,3 @@ spec: port: number: 80 pathType: Prefix ---- -# Source: element-web/templates/tests/test-connection.yaml -apiVersion: v1 -kind: Pod -metadata: - name: "othrys-element-element-web-test-connection" - labels: - helm.sh/chart: element-web-1.4.26 - app.kubernetes.io/name: element-web - app.kubernetes.io/instance: othrys-element - app.kubernetes.io/version: "1.12.6" - app.kubernetes.io/managed-by: Helm - annotations: - "helm.sh/hook": test-success -spec: - containers: - - name: wget - image: busybox - command: ['wget'] - args: ['othrys-element-element-web:80'] - restartPolicy: Never diff --git a/services/monitoring/dashboards/atlas-gpu.json b/services/monitoring/dashboards/atlas-gpu.json index 303ec2e..d4ad913 100644 --- a/services/monitoring/dashboards/atlas-gpu.json +++ b/services/monitoring/dashboards/atlas-gpu.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -53,7 +53,25 @@ "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + } + ], + "description": "Use panel links to switch namespace scope." }, { "id": 2, @@ -71,7 +89,7 @@ }, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -186,19 +204,19 @@ "templating": { "list": [ { - "name": "namespace_scope", - "label": "Namespace filter", + "name": "namespace_scope_cpu", + "label": "CPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "selected": true }, { @@ -208,11 +226,79 @@ }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "selected": false } ], - "hide": 0, + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_gpu", + "label": "GPU namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_ram", + "label": "RAM namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": false + } + ], + "hide": 2, "multi": false, "includeAll": false, "refresh": 1, diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index 495c622..ff69739 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -142,7 +142,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index d7042ed..ce1b0a3 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -76,7 +76,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"}) or on() vector(0)", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"}) or on() vector(0)", "refId": "A" } ], @@ -1086,7 +1086,7 @@ }, "targets": [ { - "expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ), 1)", + "expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_cpu}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_cpu}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1119,7 +1119,25 @@ "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": "?var-namespace_scope_cpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=namespace%3D~%22.%2A%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + } + ], + "description": "Use panel links to switch namespace scope." }, { "id": 12, @@ -1137,7 +1155,7 @@ }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1170,7 +1188,25 @@ "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + } + ], + "description": "Use panel links to switch namespace scope." }, { "id": 13, @@ -1188,7 +1224,7 @@ }, "targets": [ { - "expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ), 1)", + "expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_ram}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1221,7 +1257,25 @@ "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22.%2A%22", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22", + "targetBlank": false + } + ], + "description": "Use panel links to switch namespace scope." }, { "id": 14, @@ -1793,19 +1847,19 @@ "templating": { "list": [ { - "name": "namespace_scope", - "label": "Namespace filter", + "name": "namespace_scope_cpu", + "label": "CPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "selected": true }, { @@ -1815,11 +1869,79 @@ }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "selected": false } ], - "hide": 0, + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_gpu", + "label": "GPU namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_ram", + "label": "RAM namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": false + } + ], + "hide": 2, "multi": false, "includeAll": false, "refresh": 1, diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json index 4b2a54a..b6d0be0 100644 --- a/services/monitoring/dashboards/atlas-pods.json +++ b/services/monitoring/dashboards/atlas-pods.json @@ -200,7 +200,7 @@ }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-gpu.yaml b/services/monitoring/grafana-dashboard-gpu.yaml index 680cccc..41b4734 100644 --- a/services/monitoring/grafana-dashboard-gpu.yaml +++ b/services/monitoring/grafana-dashboard-gpu.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -62,7 +62,25 @@ data: "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + } + ], + "description": "Use panel links to switch namespace scope." }, { "id": 2, @@ -80,7 +98,7 @@ data: }, "targets": [ { - "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)", + "expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -195,19 +213,19 @@ data: "templating": { "list": [ { - "name": "namespace_scope", - "label": "Namespace filter", + "name": "namespace_scope_cpu", + "label": "CPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "selected": true }, { @@ -217,11 +235,79 @@ data: }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "selected": false } ], - "hide": 0, + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_gpu", + "label": "GPU namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_ram", + "label": "RAM namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": false + } + ], + "hide": 2, "multi": false, "includeAll": false, "refresh": 1, diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index 542daca..854f68a 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -151,7 +151,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index ed7432e..557d120 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -85,7 +85,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"}) or on() vector(0)", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"}) or on() vector(0)", "refId": "A" } ], @@ -1095,7 +1095,7 @@ data: }, "targets": [ { - "expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ), 1)", + "expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_cpu}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_cpu}[5m])) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1128,7 +1128,25 @@ data: "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": "?var-namespace_scope_cpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=namespace%3D~%22.%2A%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + } + ], + "description": "Use panel links to switch namespace scope." }, { "id": 12, @@ -1146,7 +1164,7 @@ data: }, "targets": [ { - "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0))", + "expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1179,7 +1197,25 @@ data: "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}", + "targetBlank": false + } + ], + "description": "Use panel links to switch namespace scope." }, { "id": 13, @@ -1197,7 +1233,7 @@ data: }, "targets": [ { - "expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ), 1)", + "expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_ram}) by (namespace) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1230,7 +1266,25 @@ data: "fields": "", "values": false } - } + }, + "links": [ + { + "title": "Workload namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22", + "targetBlank": false + }, + { + "title": "All namespaces", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22.%2A%22", + "targetBlank": false + }, + { + "title": "Infrastructure namespaces only", + "url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22", + "targetBlank": false + } + ], + "description": "Use panel links to switch namespace scope." }, { "id": 14, @@ -1802,19 +1856,19 @@ data: "templating": { "list": [ { - "name": "namespace_scope", - "label": "Namespace filter", + "name": "namespace_scope_cpu", + "label": "CPU namespace filter", "type": "custom", - "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "current": { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "selected": true }, "options": [ { "text": "workload namespaces only", - "value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "selected": true }, { @@ -1824,11 +1878,79 @@ data: }, { "text": "infrastructure namespaces only", - "value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", "selected": false } ], - "hide": 0, + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_gpu", + "label": "GPU namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": false + } + ], + "hide": 2, + "multi": false, + "includeAll": false, + "refresh": 1, + "sort": 0, + "skipUrlSync": false + }, + { + "name": "namespace_scope_ram", + "label": "RAM namespace filter", + "type": "custom", + "query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "current": { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + "options": [ + { + "text": "workload namespaces only", + "value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": true + }, + { + "text": "all namespaces", + "value": "namespace=~\".*\"", + "selected": false + }, + { + "text": "infrastructure namespaces only", + "value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"", + "selected": false + } + ], + "hide": 2, "multi": false, "includeAll": false, "refresh": 1, diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml index b7c49d5..7d02e22 100644 --- a/services/monitoring/grafana-dashboard-pods.yaml +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -209,7 +209,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})", + "expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})", "refId": "A" } ],