feature/atlas-monitoring #3

Merged
bstein merged 71 commits from feature/atlas-monitoring into main 2025-12-02 20:52:36 +00:00
6 changed files with 52 additions and 33 deletions
Showing only changes of commit a1e731e929 - Show all commits

View File

@ -221,6 +221,7 @@ def stat_panel(
thresholds=None, thresholds=None,
text_mode="value", text_mode="value",
legend=None, legend=None,
instant=False,
value_suffix=None, value_suffix=None,
links=None, links=None,
): ):
@ -259,6 +260,8 @@ def stat_panel(
} }
if legend: if legend:
panel["targets"][0]["legendFormat"] = legend panel["targets"][0]["legendFormat"] = legend
if instant:
panel["targets"][0]["instant"] = True
if links: if links:
panel["links"] = links panel["links"] = links
return panel return panel
@ -339,14 +342,8 @@ def pie_panel(panel_id, title, expr, grid):
"title": title, "title": title,
"datasource": PROM_DS, "datasource": PROM_DS,
"gridPos": grid, "gridPos": grid,
"targets": [{"expr": expr, "refId": "A"}], "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
"fieldConfig": { "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []},
"defaults": {
"unit": "percent",
"displayName": "{{namespace}}",
},
"overrides": [],
},
"options": { "options": {
"legend": {"displayMode": "list", "placement": "right"}, "legend": {"displayMode": "list", "placement": "right"},
"pieType": "pie", "pieType": "pie",
@ -382,7 +379,7 @@ def build_overview():
(1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None), (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None),
( (
2, 2,
"Ready workers", "Workers ready",
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
WORKER_SUFFIX, WORKER_SUFFIX,
WORKER_TOTAL, WORKER_TOTAL,
@ -480,6 +477,7 @@ def build_overview():
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
text_mode="value_and_name", text_mode="value_and_name",
legend="{{node}}", legend="{{node}}",
instant=True,
links=link_to("atlas-nodes"), links=link_to("atlas-nodes"),
) )
) )
@ -1016,6 +1014,7 @@ def build_network_dashboard():
{"h": 4, "w": 8, "x": 16, "y": 0}, {"h": 4, "w": 8, "x": 16, "y": 0},
unit="req/s", unit="req/s",
legend="{{router}}", legend="{{router}}",
instant=True,
) )
) )
panels.append( panels.append(

View File

@ -142,7 +142,8 @@
{ {
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
"refId": "A", "refId": "A",
"legendFormat": "{{router}}" "legendFormat": "{{router}}",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {

View File

@ -70,7 +70,7 @@
{ {
"id": 2, "id": 2,
"type": "stat", "type": "stat",
"title": "Ready workers", "title": "Workers ready",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -440,7 +440,8 @@
{ {
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {
@ -512,7 +513,8 @@
{ {
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {
@ -584,7 +586,8 @@
{ {
"expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {
@ -652,7 +655,8 @@
{ {
"expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {
@ -719,13 +723,13 @@
"targets": [ "targets": [
{ {
"expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))", "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))",
"refId": "A" "refId": "A",
"legendFormat": "{{namespace}}"
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"unit": "percent", "unit": "percent"
"displayName": "{{namespace}}"
}, },
"overrides": [] "overrides": []
}, },
@ -761,13 +765,13 @@
"targets": [ "targets": [
{ {
"expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))", "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))",
"refId": "A" "refId": "A",
"legendFormat": "{{namespace}}"
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"unit": "percent", "unit": "percent"
"displayName": "{{namespace}}"
}, },
"overrides": [] "overrides": []
}, },

View File

@ -151,7 +151,8 @@ data:
{ {
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
"refId": "A", "refId": "A",
"legendFormat": "{{router}}" "legendFormat": "{{router}}",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {

View File

@ -79,7 +79,7 @@ data:
{ {
"id": 2, "id": 2,
"type": "stat", "type": "stat",
"title": "Ready workers", "title": "Workers ready",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -449,7 +449,8 @@ data:
{ {
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {
@ -521,7 +522,8 @@ data:
{ {
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {
@ -593,7 +595,8 @@ data:
{ {
"expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {
@ -661,7 +664,8 @@ data:
{ {
"expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
"refId": "A", "refId": "A",
"legendFormat": "{{node}}" "legendFormat": "{{node}}",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {
@ -728,13 +732,13 @@ data:
"targets": [ "targets": [
{ {
"expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))", "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))",
"refId": "A" "refId": "A",
"legendFormat": "{{namespace}}"
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"unit": "percent", "unit": "percent"
"displayName": "{{namespace}}"
}, },
"overrides": [] "overrides": []
}, },
@ -770,13 +774,13 @@ data:
"targets": [ "targets": [
{ {
"expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))", "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))",
"refId": "A" "refId": "A",
"legendFormat": "{{namespace}}"
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"unit": "percent", "unit": "percent"
"displayName": "{{namespace}}"
}, },
"overrides": [] "overrides": []
}, },

View File

@ -209,6 +209,16 @@ spec:
- action: keep - action: keep
source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of] source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of]
regex: flux-system;flux regex: flux-system;flux
- job_name: "titan-db"
static_configs:
- targets: ["titan-db:9100"]
relabel_configs:
- source_labels: [__address__]
target_label: instance
metric_relabel_configs:
- source_labels: [instance]
target_label: node
replacement: titan-db
--- ---