monitoring: per-panel namespace share filters

This commit is contained in:
Brad Stein 2026-01-01 14:44:33 -03:00
parent 7c31d25c24
commit 5093f77c0a
11 changed files with 568 additions and 105 deletions

View File

@ -9,6 +9,7 @@ Usage:
import argparse
import json
import textwrap
import urllib.parse
from pathlib import Path
# ---------------------------------------------------------------------------
@ -80,7 +81,7 @@ CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
WORKER_TOTAL = len(WORKER_NODES)
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system"
CP_ALLOWED_NS = "(^kube.*|.*-system$|^traefik$|^monitoring$)"
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
CONTROL_WORKLOADS_EXPR = (
@ -170,22 +171,43 @@ def node_io_expr(scope=""):
return scoped_node_expr(base, scope)
def namespace_selector(scope_var):
    """Return the label matchers used by the per-namespace container queries.

    The matchers require non-empty ``namespace``, ``pod`` and ``container``
    labels; *scope_var* is appended verbatim as the final matcher.
    """
    required_labels = ("namespace", "pod", "container")
    non_empty = ",".join(f'{label}!=""' for label in required_labels)
    return f"{non_empty},{scope_var}"
def namespace_gpu_selector(scope_var):
    """Return the label matchers used by the per-namespace GPU queries.

    Unlike the container selector this one only requires non-empty
    ``namespace`` and ``pod`` labels; *scope_var* is appended verbatim.
    """
    required_labels = ("namespace", "pod")
    non_empty = ",".join(f'{label}!=""' for label in required_labels)
    return f"{non_empty},{scope_var}"
def namespace_cpu_raw(scope_var):
    """Return a PromQL expression for the 5m CPU usage rate summed by namespace.

    *scope_var* is forwarded to ``namespace_selector`` to build the matchers.
    """
    selector = namespace_selector(scope_var)
    cpu_rate = f"rate(container_cpu_usage_seconds_total{{{selector}}}[5m])"
    return f"sum({cpu_rate}) by (namespace)"
def namespace_ram_raw(scope_var):
    """Return a PromQL expression for working-set memory summed by namespace.

    *scope_var* is forwarded to ``namespace_selector`` to build the matchers.
    """
    selector = namespace_selector(scope_var)
    working_set = f"container_memory_working_set_bytes{{{selector}}}"
    return f"sum({working_set}) by (namespace)"
def namespace_gpu_usage_instant(scope_var):
    """Return a PromQL expression for ``DCGM_FI_DEV_GPU_UTIL`` summed by namespace.

    *scope_var* is forwarded to ``namespace_gpu_selector`` to build the matchers.
    """
    selector = namespace_gpu_selector(scope_var)
    gpu_util = f"DCGM_FI_DEV_GPU_UTIL{{{selector}}}"
    return f"sum({gpu_util}) by (namespace)"
def namespace_share_expr(resource_expr):
    """Return a PromQL expression giving each series' percentage of the total.

    The numerator is *resource_expr*; the denominator is the sum of
    *resource_expr*, floored at 1 via ``clamp_min`` so the division never
    hits zero.
    """
    # NOTE(review): with a floor of 1, a summed total below 1 yields
    # percentages that add up to less than 100 -- confirm this is intended.
    numerator = f"( {resource_expr} )"
    denominator = f"clamp_min(sum{numerator}, 1)"
    return f"100 * {numerator} / {denominator}"
def namespace_cpu_share_expr():
return namespace_share_expr(NAMESPACE_CPU_RAW)
def namespace_cpu_share_expr(scope_var):
    """Return the per-namespace CPU share expression for *scope_var*."""
    cpu_by_namespace = namespace_cpu_raw(scope_var)
    return namespace_share_expr(cpu_by_namespace)
def namespace_ram_share_expr():
return namespace_share_expr(NAMESPACE_RAM_RAW)
def namespace_ram_share_expr(scope_var):
    """Return the per-namespace RAM share expression for *scope_var*."""
    ram_by_namespace = namespace_ram_raw(scope_var)
    return namespace_share_expr(ram_by_namespace)
def namespace_gpu_share_expr():
total = f"(sum({NAMESPACE_GPU_USAGE_INSTANT}) or on() vector(0))"
share = f"100 * ({NAMESPACE_GPU_USAGE_INSTANT}) / clamp_min({total}, 1)"
def namespace_gpu_share_expr(scope_var):
    """Return the per-namespace GPU share expression for *scope_var*.

    The expression is the union of two branches: each namespace's percentage
    of the summed GPU utilisation, and a synthetic ``namespace="idle"``
    series of value 100 that is only kept when the summed total equals 0.
    """
    per_namespace = namespace_gpu_usage_instant(scope_var)
    # ``or on() vector(0)`` supplies a 0 total when the sum returns no series.
    cluster_total = f"(sum({per_namespace}) or on() vector(0))"
    busy_share = f"100 * ({per_namespace}) / clamp_min({cluster_total}, 1)"
    idle_slice = (
        'label_replace(vector(100), "namespace", "idle", "", "") and on() ('
        f"{cluster_total} == 0)"
    )
    return f"({busy_share}) or ({idle_slice})"
@ -272,20 +294,12 @@ STUCK_TABLE_EXPR = (
")"
)
NAMESPACE_SCOPE_WORKLOAD = 'namespace!~"(^kube.*|.*-system$|^traefik$)"'
NAMESPACE_SCOPE_WORKLOAD = 'namespace!~"(^kube.*|.*-system$|^traefik$|^monitoring$)"'
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
NAMESPACE_SCOPE_INFRA = 'namespace=~"(^kube.*|.*-system$|^traefik$)"'
NAMESPACE_SCOPE_VAR = "$namespace_scope"
NAMESPACE_SELECTOR = f'namespace!="",pod!="",container!="",{NAMESPACE_SCOPE_VAR}'
NAMESPACE_GPU_SELECTOR = f'namespace!="",pod!="",{NAMESPACE_SCOPE_VAR}'
NAMESPACE_CPU_RAW = (
f'sum(rate(container_cpu_usage_seconds_total{{{NAMESPACE_SELECTOR}}}[5m])) by (namespace)'
)
NAMESPACE_RAM_RAW = f'sum(container_memory_working_set_bytes{{{NAMESPACE_SELECTOR}}}) by (namespace)'
NAMESPACE_SCOPE_INFRA = 'namespace=~"(^kube.*|.*-system$|^traefik$|^monitoring$)"'
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES)
NAMESPACE_GPU_USAGE_INSTANT = f'sum(DCGM_FI_DEV_GPU_UTIL{{{NAMESPACE_GPU_SELECTOR}}}) by (namespace)'
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
TRAEFIK_NET_INGRESS = (
'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
@ -536,9 +550,9 @@ def table_panel(
return panel
def pie_panel(panel_id, title, expr, grid):
def pie_panel(panel_id, title, expr, grid, *, links=None, description=None):
"""Return a pie chart panel with readable namespace labels."""
return {
panel = {
"id": panel_id,
"type": "piechart",
"title": title,
@ -562,9 +576,14 @@ def pie_panel(panel_id, title, expr, grid):
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
},
}
if links:
panel["links"] = links
if description:
panel["description"] = description
return panel
def namespace_scope_variable():
def namespace_scope_variable(var_name, label):
options = [
{
"text": "workload namespaces only",
@ -587,13 +606,13 @@ def namespace_scope_variable():
+ NAMESPACE_SCOPE_INFRA
)
return {
"name": "namespace_scope",
"label": "Namespace filter",
"name": var_name,
"label": label,
"type": "custom",
"query": query,
"current": {"text": options[0]["text"], "value": options[0]["value"], "selected": True},
"options": options,
"hide": 0,
"hide": 2,
"multi": False,
"includeAll": False,
"refresh": 1,
@ -602,6 +621,28 @@ def namespace_scope_variable():
}
def namespace_scope_links(var_name):
    """Return panel links that switch *var_name* between the namespace scopes.

    Each link is a relative URL (query string only) that sets
    ``var-<var_name>`` to the fully percent-quoted scope matcher and passes
    every other name in ``NAMESPACE_SCOPE_VARS`` through as a ``${name}``
    placeholder.
    """

    def scope_url(scope_value):
        # safe="" so every reserved character in the matcher is quoted.
        quoted = urllib.parse.quote(scope_value, safe="")
        pairs = [
            f"var-{name}={quoted}" if name == var_name else f"var-{name}=${{{name}}}"
            for name in NAMESPACE_SCOPE_VARS
        ]
        return "?" + "&".join(pairs)

    choices = (
        ("Workload namespaces only", NAMESPACE_SCOPE_WORKLOAD),
        ("All namespaces", NAMESPACE_SCOPE_ALL),
        ("Infrastructure namespaces only", NAMESPACE_SCOPE_INFRA),
    )
    return [
        {"title": title, "url": scope_url(scope), "targetBlank": False}
        for title, scope in choices
    ]
def bargauge_panel(
panel_id,
title,
@ -890,28 +931,38 @@ def build_overview():
)
)
cpu_scope = "$namespace_scope_cpu"
gpu_scope = "$namespace_scope_gpu"
ram_scope = "$namespace_scope_ram"
panels.append(
pie_panel(
11,
"Namespace CPU Share",
namespace_cpu_share_expr(),
namespace_cpu_share_expr(cpu_scope),
{"h": 9, "w": 8, "x": 0, "y": 16},
links=namespace_scope_links("namespace_scope_cpu"),
description="Use panel links to switch namespace scope.",
)
)
panels.append(
pie_panel(
12,
"Namespace GPU Share",
namespace_gpu_share_expr(),
namespace_gpu_share_expr(gpu_scope),
{"h": 9, "w": 8, "x": 8, "y": 16},
links=namespace_scope_links("namespace_scope_gpu"),
description="Use panel links to switch namespace scope.",
)
)
panels.append(
pie_panel(
13,
"Namespace RAM Share",
namespace_ram_share_expr(),
namespace_ram_share_expr(ram_scope),
{"h": 9, "w": 8, "x": 16, "y": 16},
links=namespace_scope_links("namespace_scope_ram"),
description="Use panel links to switch namespace scope.",
)
)
@ -1077,7 +1128,13 @@ def build_overview():
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "overview"],
"templating": {"list": [namespace_scope_variable()]},
"templating": {
"list": [
namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"),
namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"),
namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"),
]
},
"time": {"from": "now-1h", "to": "now"},
"refresh": "1m",
"links": [],
@ -1718,19 +1775,22 @@ def build_network_dashboard():
def build_gpu_dashboard():
panels = []
gpu_scope = "$namespace_scope_gpu"
panels.append(
pie_panel(
1,
"Namespace GPU Share",
namespace_gpu_share_expr(),
namespace_gpu_share_expr(gpu_scope),
{"h": 8, "w": 12, "x": 0, "y": 0},
links=namespace_scope_links("namespace_scope_gpu"),
description="Use panel links to switch namespace scope.",
)
)
panels.append(
timeseries_panel(
2,
"GPU Util by Namespace",
NAMESPACE_GPU_USAGE_INSTANT,
namespace_gpu_usage_instant(gpu_scope),
{"h": 8, "w": 12, "x": 12, "y": 0},
unit="percent",
legend="{{namespace}}",
@ -1771,7 +1831,13 @@ def build_gpu_dashboard():
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "gpu"],
"templating": {"list": [namespace_scope_variable()]},
"templating": {
"list": [
namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"),
namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"),
namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"),
]
},
}

View File

@ -7,6 +7,8 @@ metadata:
spec:
replicas: 1
revisionHistoryLimit: 2
strategy:
type: Recreate
selector:
matchLabels:
app: ollama

View File

@ -200,24 +200,3 @@ spec:
port:
number: 80
pathType: Prefix
---
# Source: element-web/templates/tests/test-connection.yaml
apiVersion: v1
kind: Pod
metadata:
name: "othrys-element-element-web-test-connection"
labels:
helm.sh/chart: element-web-1.4.26
app.kubernetes.io/name: element-web
app.kubernetes.io/instance: othrys-element
app.kubernetes.io/version: "1.12.6"
app.kubernetes.io/managed-by: Helm
annotations:
"helm.sh/hook": test-success
spec:
containers:
- name: wget
image: busybox
command: ['wget']
args: ['othrys-element-element-web:80']
restartPolicy: Never

View File

@ -20,7 +20,7 @@
},
"targets": [
{
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0))",
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -53,7 +53,25 @@
"fields": "",
"values": false
}
}
},
"links": [
{
"title": "Workload namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "All namespaces",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "Infrastructure namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
}
],
"description": "Use panel links to switch namespace scope."
},
{
"id": 2,
@ -71,7 +89,7 @@
},
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -186,19 +204,19 @@
"templating": {
"list": [
{
"name": "namespace_scope",
"label": "Namespace filter",
"name": "namespace_scope_cpu",
"label": "CPU namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
{
@ -208,11 +226,79 @@
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": false
}
],
"hide": 0,
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
},
{
"name": "namespace_scope_gpu",
"label": "GPU namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
},
{
"name": "namespace_scope_ram",
"label": "RAM namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,

View File

@ -142,7 +142,7 @@
},
"targets": [
{
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})",
"refId": "A"
}
],

View File

@ -76,7 +76,7 @@
},
"targets": [
{
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"}) or on() vector(0)",
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"}) or on() vector(0)",
"refId": "A"
}
],
@ -1086,7 +1086,7 @@
},
"targets": [
{
"expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ), 1)",
"expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_cpu}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_cpu}[5m])) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1119,7 +1119,25 @@
"fields": "",
"values": false
}
}
},
"links": [
{
"title": "Workload namespaces only",
"url": "?var-namespace_scope_cpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "All namespaces",
"url": "?var-namespace_scope_cpu=namespace%3D~%22.%2A%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "Infrastructure namespaces only",
"url": "?var-namespace_scope_cpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
}
],
"description": "Use panel links to switch namespace scope."
},
{
"id": 12,
@ -1137,7 +1155,7 @@
},
"targets": [
{
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0))",
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1170,7 +1188,25 @@
"fields": "",
"values": false
}
}
},
"links": [
{
"title": "Workload namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "All namespaces",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "Infrastructure namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
}
],
"description": "Use panel links to switch namespace scope."
},
{
"id": 13,
@ -1188,7 +1224,7 @@
},
"targets": [
{
"expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ), 1)",
"expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_ram}) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1221,7 +1257,25 @@
"fields": "",
"values": false
}
}
},
"links": [
{
"title": "Workload namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22",
"targetBlank": false
},
{
"title": "All namespaces",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22.%2A%22",
"targetBlank": false
},
{
"title": "Infrastructure namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22",
"targetBlank": false
}
],
"description": "Use panel links to switch namespace scope."
},
{
"id": 14,
@ -1793,19 +1847,19 @@
"templating": {
"list": [
{
"name": "namespace_scope",
"label": "Namespace filter",
"name": "namespace_scope_cpu",
"label": "CPU namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
{
@ -1815,11 +1869,79 @@
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": false
}
],
"hide": 0,
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
},
{
"name": "namespace_scope_gpu",
"label": "GPU namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
},
{
"name": "namespace_scope_ram",
"label": "RAM namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,

View File

@ -200,7 +200,7 @@
},
"targets": [
{
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})",
"refId": "A"
}
],

View File

@ -29,7 +29,7 @@ data:
},
"targets": [
{
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0))",
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -62,7 +62,25 @@ data:
"fields": "",
"values": false
}
}
},
"links": [
{
"title": "Workload namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "All namespaces",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "Infrastructure namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
}
],
"description": "Use panel links to switch namespace scope."
},
{
"id": 2,
@ -80,7 +98,7 @@ data:
},
"targets": [
{
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)",
"expr": "sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -195,19 +213,19 @@ data:
"templating": {
"list": [
{
"name": "namespace_scope",
"label": "Namespace filter",
"name": "namespace_scope_cpu",
"label": "CPU namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
{
@ -217,11 +235,79 @@ data:
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": false
}
],
"hide": 0,
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
},
{
"name": "namespace_scope_gpu",
"label": "GPU namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
},
{
"name": "namespace_scope_ram",
"label": "RAM namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,

View File

@ -151,7 +151,7 @@ data:
},
"targets": [
{
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})",
"refId": "A"
}
],

View File

@ -85,7 +85,7 @@ data:
},
"targets": [
{
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"}) or on() vector(0)",
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"}) or on() vector(0)",
"refId": "A"
}
],
@ -1095,7 +1095,7 @@ data:
},
"targets": [
{
"expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}[5m])) by (namespace) ), 1)",
"expr": "100 * ( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_cpu}[5m])) by (namespace) ) / clamp_min(sum( sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_cpu}[5m])) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1128,7 +1128,25 @@ data:
"fields": "",
"values": false
}
}
},
"links": [
{
"title": "Workload namespaces only",
"url": "?var-namespace_scope_cpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "All namespaces",
"url": "?var-namespace_scope_cpu=namespace%3D~%22.%2A%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "Infrastructure namespaces only",
"url": "?var-namespace_scope_cpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
}
],
"description": "Use panel links to switch namespace scope."
},
{
"id": 12,
@ -1146,7 +1164,7 @@ data:
},
"targets": [
{
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope}) by (namespace)) or on() vector(0)) == 0))",
"expr": "(100 * (sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) / clamp_min((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)), 1)) or (label_replace(vector(100), \"namespace\", \"idle\", \"\", \"\") and on() ((sum(sum(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\",$namespace_scope_gpu}) by (namespace)) or on() vector(0)) == 0))",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1179,7 +1197,25 @@ data:
"fields": "",
"values": false
}
}
},
"links": [
{
"title": "Workload namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "All namespaces",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22.%2A%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
},
{
"title": "Infrastructure namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22&var-namespace_scope_ram=${namespace_scope_ram}",
"targetBlank": false
}
],
"description": "Use panel links to switch namespace scope."
},
{
"id": 13,
@ -1197,7 +1233,7 @@ data:
},
"targets": [
{
"expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope}) by (namespace) ), 1)",
"expr": "100 * ( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_ram}) by (namespace) ) / clamp_min(sum( sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\",$namespace_scope_ram}) by (namespace) ), 1)",
"refId": "A",
"legendFormat": "{{namespace}}"
}
@ -1230,7 +1266,25 @@ data:
"fields": "",
"values": false
}
}
},
"links": [
{
"title": "Workload namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%21~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22",
"targetBlank": false
},
{
"title": "All namespaces",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22.%2A%22",
"targetBlank": false
},
{
"title": "Infrastructure namespaces only",
"url": "?var-namespace_scope_cpu=${namespace_scope_cpu}&var-namespace_scope_gpu=${namespace_scope_gpu}&var-namespace_scope_ram=namespace%3D~%22%28%5Ekube.%2A%7C.%2A-system%24%7C%5Etraefik%24%7C%5Emonitoring%24%29%22",
"targetBlank": false
}
],
"description": "Use panel links to switch namespace scope."
},
{
"id": 14,
@ -1802,19 +1856,19 @@ data:
"templating": {
"list": [
{
"name": "namespace_scope",
"label": "Namespace filter",
"name": "namespace_scope_cpu",
"label": "CPU namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$)\"",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
{
@ -1824,11 +1878,79 @@ data:
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$)\"",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": false
}
],
"hide": 0,
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
},
{
"name": "namespace_scope_gpu",
"label": "GPU namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,
"sort": 0,
"skipUrlSync": false
},
{
"name": "namespace_scope_ram",
"label": "RAM namespace filter",
"type": "custom",
"query": "workload namespaces only : namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\",all namespaces : namespace=~\".*\",infrastructure namespaces only : namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"current": {
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
"options": [
{
"text": "workload namespaces only",
"value": "namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": true
},
{
"text": "all namespaces",
"value": "namespace=~\".*\"",
"selected": false
},
{
"text": "infrastructure namespaces only",
"value": "namespace=~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"",
"selected": false
}
],
"hide": 2,
"multi": false,
"includeAll": false,
"refresh": 1,

View File

@ -209,7 +209,7 @@ data:
},
"targets": [
{
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system\"})",
"expr": "sum(kube_pod_info{node=~\"titan-0a|titan-0b|titan-0c\",namespace!~\"(^kube.*|.*-system$|^traefik$|^monitoring$)\"})",
"refId": "A"
}
],