feature/atlas-monitoring #3

Merged
bstein merged 71 commits from feature/atlas-monitoring into main 2025-12-02 20:52:36 +00:00
5 changed files with 28 additions and 26 deletions
Showing only changes of commit 7009a4f9ff - Show all commits

View File

@ -178,7 +178,9 @@ def namespace_ram_share_expr():
def namespace_gpu_share_expr(): def namespace_gpu_share_expr():
selected = f"( {NAMESPACE_GPU_RAW} ) and on(namespace) ( {NAMESPACE_COMBINED_FILTER} )" selected = (
f"( {NAMESPACE_GPU_RAW} ) + on(namespace) group_left() (0 * {NAMESPACE_COMBINED_FILTER})"
)
total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)" total = f"clamp_min(sum( {NAMESPACE_GPU_RAW} ), 1)"
return f"100 * ( {selected} ) / {total}" return f"100 * ( {selected} ) / {total}"
@ -250,8 +252,8 @@ NAMESPACE_COMBINED_FILTER = (
+ " * 10))" + " * 10))"
) )
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
NET_INGRESS_EXPR = 'sum(rate(node_network_receive_bytes_total{device!~"lo"}[5m])) or on() vector(0)' NET_INGRESS_EXPR = 'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
NET_EGRESS_EXPR = 'sum(rate(node_network_transmit_bytes_total{device!~"lo"}[5m])) or on() vector(0)' NET_EGRESS_EXPR = 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Panel factories # Panel factories
@ -471,7 +473,7 @@ def build_overview():
thresholds = { thresholds = {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
{"color": "red", "value": 0}, {"color": "red", "value": None},
{"color": "orange", "value": WORKER_TOTAL - 2}, {"color": "orange", "value": WORKER_TOTAL - 2},
{"color": "yellow", "value": WORKER_TOTAL - 1}, {"color": "yellow", "value": WORKER_TOTAL - 1},
{"color": "green", "value": WORKER_TOTAL}, {"color": "green", "value": WORKER_TOTAL},
@ -481,7 +483,7 @@ def build_overview():
thresholds = { thresholds = {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
{"color": "red", "value": 0}, {"color": "red", "value": None},
{"color": "green", "value": CONTROL_TOTAL}, {"color": "green", "value": CONTROL_TOTAL},
], ],
} }
@ -489,7 +491,7 @@ def build_overview():
thresholds = { thresholds = {
"mode": "absolute", "mode": "absolute",
"steps": [ "steps": [
{"color": "green", "value": 0}, {"color": "green", "value": None},
{"color": "yellow", "value": 1}, {"color": "yellow", "value": 1},
{"color": "orange", "value": 2}, {"color": "orange", "value": 2},
{"color": "red", "value": 3}, {"color": "red", "value": 3},

View File

@ -20,7 +20,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A" "refId": "A"
} }
], ],
@ -80,7 +80,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A" "refId": "A"
} }
], ],

View File

@ -38,7 +38,7 @@
"steps": [ "steps": [
{ {
"color": "red", "color": "red",
"value": 0 "value": null
}, },
{ {
"color": "orange", "color": "orange",
@ -107,7 +107,7 @@
"steps": [ "steps": [
{ {
"color": "red", "color": "red",
"value": 0 "value": null
}, },
{ {
"color": "green", "color": "green",
@ -168,7 +168,7 @@
"steps": [ "steps": [
{ {
"color": "green", "color": "green",
"value": 0 "value": null
}, },
{ {
"color": "yellow", "color": "yellow",
@ -243,7 +243,7 @@
"steps": [ "steps": [
{ {
"color": "green", "color": "green",
"value": 0 "value": null
}, },
{ {
"color": "yellow", "color": "yellow",
@ -318,7 +318,7 @@
"steps": [ "steps": [
{ {
"color": "green", "color": "green",
"value": 0 "value": null
}, },
{ {
"color": "yellow", "color": "yellow",
@ -764,7 +764,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }
@ -1016,7 +1016,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A", "refId": "A",
"legendFormat": "Ingress" "legendFormat": "Ingress"
} }
@ -1060,7 +1060,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A", "refId": "A",
"legendFormat": "Egress" "legendFormat": "Egress"
} }

View File

@ -29,7 +29,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A" "refId": "A"
} }
], ],
@ -89,7 +89,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A" "refId": "A"
} }
], ],

View File

@ -47,7 +47,7 @@ data:
"steps": [ "steps": [
{ {
"color": "red", "color": "red",
"value": 0 "value": null
}, },
{ {
"color": "orange", "color": "orange",
@ -116,7 +116,7 @@ data:
"steps": [ "steps": [
{ {
"color": "red", "color": "red",
"value": 0 "value": null
}, },
{ {
"color": "green", "color": "green",
@ -177,7 +177,7 @@ data:
"steps": [ "steps": [
{ {
"color": "green", "color": "green",
"value": 0 "value": null
}, },
{ {
"color": "yellow", "color": "yellow",
@ -252,7 +252,7 @@ data:
"steps": [ "steps": [
{ {
"color": "green", "color": "green",
"value": 0 "value": null
}, },
{ {
"color": "yellow", "color": "yellow",
@ -327,7 +327,7 @@ data:
"steps": [ "steps": [
{ {
"color": "green", "color": "green",
"value": 0 "value": null
}, },
{ {
"color": "yellow", "color": "yellow",
@ -773,7 +773,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) and on(namespace) ( topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10)) ) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)", "expr": "100 * ( ( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ) + on(namespace) group_left() (0 * topk(10, (sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace)) + (sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace) / 1e9) + ((sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) * 10))) ) / clamp_min(sum( (sum((kube_pod_container_resource_requests{namespace!=\"\",resource=\"nvidia.com/gpu\"} or kube_pod_container_resource_limits{namespace!=\"\",resource=\"nvidia.com/gpu\"})) by (namespace)) or on(namespace) group_left() (sum by (namespace) (kube_pod_info{namespace!=\"\"} and on(node) kube_node_info{node=~\"titan-20|titan-21|titan-22|titan-24\"})) ), 1)",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}" "legendFormat": "{{namespace}}"
} }
@ -1025,7 +1025,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", "expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A", "refId": "A",
"legendFormat": "Ingress" "legendFormat": "Ingress"
} }
@ -1069,7 +1069,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m])) or on() vector(0)", "expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m]))",
"refId": "A", "refId": "A",
"legendFormat": "Egress" "legendFormat": "Egress"
} }