diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index 44a0de1..b53c8c7 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -178,7 +178,9 @@ def namespace_ram_share_expr(): def namespace_gpu_share_expr(): - selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" + selected = ( + f"( {NAMESPACE_GPU_RAW} ) + on(namespace) group_left() (0 * {NAMESPACE_COMBINED_FILTER})" + ) total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" return f"100 * ( {selected} ) / {total}" @@ -250,8 +252,8 @@ NAMESPACE_COMBINED_FILTER = ( + " * 10))" ) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" -NET_INGRESS_EXPR = 'sum(rate(node_network_receive_bytes_total{device!~"lo"}[5m])) or on() vector(0)' -NET_EGRESS_EXPR = 'sum(rate(node_network_transmit_bytes_total{device!~"lo"}[5m])) or on() vector(0)' +NET_INGRESS_EXPR = 'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))' +NET_EGRESS_EXPR = 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))' # --------------------------------------------------------------------------- # Panel factories @@ -471,7 +473,7 @@ def build_overview(): thresholds = { "mode": "absolute", "steps": [ - {"color": "red", "value": 0}, + {"color": "red", "value": None}, {"color": "orange", "value": WORKER_TOTAL - 2}, {"color": "yellow", "value": WORKER_TOTAL - 1}, {"color": "green", "value": WORKER_TOTAL}, @@ -481,7 +483,7 @@ def build_overview(): thresholds = { "mode": "absolute", "steps": [ - {"color": "red", "value": 0}, + {"color": "red", "value": None}, {"color": "green", "value": CONTROL_TOTAL}, ], } @@ -489,7 +491,7 @@ def build_overview(): thresholds = { "mode": "absolute", "steps": [ - {"color": "green", "value": 0}, + {"color": "green", "value": None}, {"color": "yellow", "value": 1}, {"color": "orange", "value": 2}, {"color": "red", "value": 3}, diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index 27da627..a4daa0c 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -20,7 +20,7 @@ }, "targets": [ { - "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A" } ], @@ -80,7 +80,7 @@ }, "targets": [ { - "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A" } ], diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 11634d9..16e01b3 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -38,7 +38,7 @@ "steps": [ { "color": "red", - "value": 0 + "value": null }, { "color": "orange", @@ -107,7 +107,7 @@ "steps": [ { "color": "red", - "value": 0 + "value": null }, { "color": "green", @@ -168,7 +168,7 @@ "steps": [ { "color": "green", - "value": 0 + "value": null }, { "color": "yellow", @@ -243,7 +243,7 @@ "steps": [ { "color": "green", - "value": 0 + "value": null }, { "color": "yellow", @@ -318,7 +318,7 @@ "steps": [ { "color": "green", - "value": 0 + "value": null }, { "color": "yellow", @@ -764,7 +764,7 @@ }, "targets": [ { - "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", + "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1016,7 +1016,7 @@ }, "targets": [ { - "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A", "legendFormat": "Ingress" } @@ -1060,7 +1060,7 @@ }, "targets": [ { - "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A", "legendFormat": "Egress" } diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index 1b70159..029987a 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -29,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A" } ], @@ -89,7 +89,7 @@ data: }, "targets": [ { - "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A" } ], diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index f243cf8..55afe64 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -47,7 +47,7 @@ data: "steps": [ { "color": "red", - "value": 0 + "value": null }, { "color": "orange", @@ -116,7 +116,7 @@ data: "steps": [ { "color": "red", - "value": 0 + "value": null }, { "color": "green", @@ -177,7 +177,7 @@ data: "steps": [ { "color": "green", - "value": 0 + "value": null }, { "color": "yellow", @@ -252,7 +252,7 @@ data: "steps": [ { "color": "green", - "value": 0 + "value": null }, { "color": "yellow", @@ -327,7 +327,7 @@ data: "steps": [ { "color": "green", - "value": 0 + "value": null }, { "color": "yellow", @@ -773,7 +773,7 @@ data: }, "targets": [ { - "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", + "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", "refId": "A", "legendFormat": "{{namespace}}" } @@ -1025,7 +1025,7 @@ data: }, "targets": [ { - "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A", "legendFormat": "Ingress" } @@ -1069,7 +1069,7 @@ data: }, "targets": [ { - "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", + "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))", "refId": "A", "legendFormat": "Egress" }