monitoring: worker/control-plane splits

This commit is contained in:
Brad Stein 2025-11-17 21:48:12 -03:00
parent 76d3dc6ae2
commit b59677615c
3 changed files with 25 additions and 24 deletions

View File

@ -509,11 +509,12 @@ def build_overview():
)
)
worker_filter = f"{WORKER_REGEX}"
panels.append(
timeseries_panel(
13,
"Cluster node CPU",
node_cpu_expr(),
"Worker node CPU",
node_cpu_expr(worker_filter),
{"h": 8, "w": 12, "x": 0, "y": 19},
unit="percent",
legend="{{node}}",
@ -526,8 +527,8 @@ def build_overview():
panels.append(
timeseries_panel(
14,
"Cluster node RAM",
node_mem_expr(),
"Worker node RAM",
node_mem_expr(worker_filter),
{"h": 8, "w": 12, "x": 12, "y": 19},
unit="percent",
legend="{{node}}",
@ -541,8 +542,8 @@ def build_overview():
panels.append(
timeseries_panel(
15,
"Control plane CPU (incl. titan-db)",
node_cpu_expr(CONTROL_ALL_REGEX),
"Control plane CPU",
node_cpu_expr(CONTROL_REGEX),
{"h": 7, "w": 12, "x": 0, "y": 27},
unit="percent",
legend="{{node}}",
@ -553,8 +554,8 @@ def build_overview():
panels.append(
timeseries_panel(
16,
"Control plane RAM (incl. titan-db)",
node_mem_expr(CONTROL_ALL_REGEX),
"Control plane RAM",
node_mem_expr(CONTROL_REGEX),
{"h": 7, "w": 12, "x": 12, "y": 27},
unit="percent",
legend="{{node}}",

View File

@ -793,7 +793,7 @@
{
"id": 13,
"type": "timeseries",
"title": "Cluster node CPU",
"title": "Worker node CPU",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -806,7 +806,7 @@
},
"targets": [
{
"expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
@ -840,7 +840,7 @@
{
"id": 14,
"type": "timeseries",
"title": "Cluster node RAM",
"title": "Worker node RAM",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -853,7 +853,7 @@
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
@ -887,7 +887,7 @@
{
"id": 15,
"type": "timeseries",
"title": "Control plane CPU (incl. titan-db)",
"title": "Control plane CPU",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -900,7 +900,7 @@
},
"targets": [
{
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
@ -924,7 +924,7 @@
{
"id": 16,
"type": "timeseries",
"title": "Control plane RAM (incl. titan-db)",
"title": "Control plane RAM",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -937,7 +937,7 @@
},
"targets": [
{
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}

View File

@ -802,7 +802,7 @@ data:
{
"id": 13,
"type": "timeseries",
"title": "Cluster node CPU",
"title": "Worker node CPU",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -815,7 +815,7 @@ data:
},
"targets": [
{
"expr": "avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
@ -849,7 +849,7 @@ data:
{
"id": 14,
"type": "timeseries",
"title": "Cluster node RAM",
"title": "Worker node RAM",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -862,7 +862,7 @@ data:
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
@ -896,7 +896,7 @@ data:
{
"id": 15,
"type": "timeseries",
"title": "Control plane CPU (incl. titan-db)",
"title": "Control plane CPU",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -909,7 +909,7 @@ data:
},
"targets": [
{
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"expr": "(avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}
@ -933,7 +933,7 @@ data:
{
"id": 16,
"type": "timeseries",
"title": "Control plane RAM (incl. titan-db)",
"title": "Control plane RAM",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -946,7 +946,7 @@ data:
},
"targets": [
{
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c|titan-db\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"expr": "(avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))) * on(node) group_left() label_replace(node_uname_info{nodename=~\"titan-0a|titan-0b|titan-0c\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")",
"refId": "A",
"legendFormat": "{{node}}"
}