Review note: the namespace share panels select the combined top-10 namespaces
with `and on(namespace)`. The earlier `* on(namespace) group_left()` join
multiplied every slice by the combined CPU+RAM score instead of only filtering.
Line breaks and indentation below were reconstructed from a collapsed copy of
this patch, and the index hashes were not regenerated; verify against the
repository before applying.

diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py
index cf34d6a..3c0d6fa 100644
--- a/scripts/render_dashboards.py
+++ b/scripts/render_dashboards.py
@@ -165,6 +165,16 @@ def node_io_expr(scope=""):
     return scoped_node_expr(base, scope)
 
 
+def namespace_cpu_share_expr():
+    """Return per-namespace CPU usage limited to the combined top-10 namespaces."""
+    return f"({NAMESPACE_CPU_EXPR}) and on(namespace) ({NAMESPACE_COMBINED_FILTER})"
+
+
+def namespace_ram_share_expr():
+    """Return per-namespace RAM usage limited to the combined top-10 namespaces."""
+    return f"({NAMESPACE_RAM_EXPR}) and on(namespace) ({NAMESPACE_COMBINED_FILTER})"
+
+
 PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
 CRASHLOOP_EXPR = (
     'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
@@ -199,12 +209,18 @@ STUCK_TABLE_EXPR = (
 )
 
 NAMESPACE_CPU_EXPR = (
-    'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=""'
-    ',pod!=""}[5m])) by (namespace))'
+    'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
 )
 NAMESPACE_RAM_EXPR = (
-    'topk(10, sum(container_memory_working_set_bytes{namespace!=""'
-    ',pod!=""}) by (namespace))'
+    'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
+)
+# Top 10 namespaces ranked by CPU rate + RAM bytes / 1e9; used only to select series.
+NAMESPACE_COMBINED_FILTER = (
+    'topk(10, ('
+    + NAMESPACE_CPU_EXPR
+    + ") + ("
+    + NAMESPACE_RAM_EXPR
+    + ' / 1e9))'
 )
 TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
 NET_INGRESS_EXPR = (
@@ -496,7 +512,7 @@ def build_overview():
         pie_panel(
             11,
             "Namespace CPU share",
-            NAMESPACE_CPU_EXPR,
+            namespace_cpu_share_expr(),
             {"h": 9, "w": 12, "x": 0, "y": 10},
         )
     )
@@ -504,7 +520,7 @@ def build_overview():
         pie_panel(
             12,
             "Namespace RAM share",
-            NAMESPACE_RAM_EXPR,
+            namespace_ram_share_expr(),
             {"h": 9, "w": 12, "x": 12, "y": 10},
         )
     )
diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json
index bd081a7..7529ae8 100644
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@@ -722,7 +722,7 @@
       },
       "targets": [
         {
-          "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))",
+          "expr": "(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) and on(namespace) (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
@@ -764,7 +764,7 @@
       },
       "targets": [
         {
-          "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))",
+          "expr": "(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace)) and on(namespace) (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))",
           "refId": "A",
           "legendFormat": "{{namespace}}"
         }
diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml
index fb3d111..ea3523c 100644
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@@ -731,7 +731,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))",
+              "expr": "(sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) and on(namespace) (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }
@@ -773,7 +773,7 @@ data:
           },
           "targets": [
             {
-              "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))",
+              "expr": "(sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace)) and on(namespace) (topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9)))",
               "refId": "A",
               "legendFormat": "{{namespace}}"
             }