From a1e731e9299a5ca1908a4953d5d1729e9e6d74e9 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 19:38:40 -0300 Subject: [PATCH] monitoring: fix hottest stats and titan-db scrape --- scripts/render_dashboards.py | 17 ++++++------ .../monitoring/dashboards/atlas-network.json | 3 ++- .../monitoring/dashboards/atlas-overview.json | 26 +++++++++++-------- .../monitoring/grafana-dashboard-network.yaml | 3 ++- .../grafana-dashboard-overview.yaml | 26 +++++++++++-------- services/monitoring/helmrelease.yaml | 10 +++++++ 6 files changed, 52 insertions(+), 33 deletions(-) diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py index b88d5a4..d726015 100644 --- a/scripts/render_dashboards.py +++ b/scripts/render_dashboards.py @@ -221,6 +221,7 @@ def stat_panel( thresholds=None, text_mode="value", legend=None, + instant=False, value_suffix=None, links=None, ): @@ -259,6 +260,8 @@ def stat_panel( } if legend: panel["targets"][0]["legendFormat"] = legend + if instant: + panel["targets"][0]["instant"] = True if links: panel["links"] = links return panel @@ -339,14 +342,8 @@ def pie_panel(panel_id, title, expr, grid): "title": title, "datasource": PROM_DS, "gridPos": grid, - "targets": [{"expr": expr, "refId": "A"}], - "fieldConfig": { - "defaults": { - "unit": "percent", - "displayName": "{{namespace}}", - }, - "overrides": [], - }, + "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}], + "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []}, "options": { "legend": {"displayMode": "list", "placement": "right"}, "pieType": "pie", @@ -382,7 +379,7 @@ def build_overview(): (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None), ( 2, - "Ready workers", + "Workers ready", f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', WORKER_SUFFIX, WORKER_TOTAL, @@ -480,6 +477,7 @@ def build_overview(): thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, text_mode="value_and_name", legend="{{node}}", + instant=True, links=link_to("atlas-nodes"), ) ) @@ -1016,6 +1014,7 @@ def build_network_dashboard(): {"h": 4, "w": 8, "x": 16, "y": 0}, unit="req/s", legend="{{router}}", + instant=True, ) ) panels.append( diff --git a/services/monitoring/dashboards/atlas-network.json b/services/monitoring/dashboards/atlas-network.json index e412045..abd9da7 100644 --- a/services/monitoring/dashboards/atlas-network.json +++ b/services/monitoring/dashboards/atlas-network.json @@ -142,7 +142,8 @@ { "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", "refId": "A", - "legendFormat": "{{router}}" + "legendFormat": "{{router}}", + "instant": true } ], "fieldConfig": { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index ec137f1..1442cf5 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -70,7 +70,7 @@ { "id": 2, "type": "stat", - "title": "Ready workers", + "title": "Workers ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -440,7 +440,8 @@ { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -512,7 +513,8 @@ { "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -584,7 +586,8 @@ { "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -652,7 +655,8 @@ { "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -719,13 +723,13 @@ "targets": [ { "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))", - "refId": "A" + "refId": "A", + "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { - "unit": "percent", - "displayName": "{{namespace}}" + "unit": "percent" }, "overrides": [] }, @@ -761,13 +765,13 @@ "targets": [ { "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))", - "refId": "A" + "refId": "A", + "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { - "unit": "percent", - "displayName": "{{namespace}}" + "unit": "percent" }, "overrides": [] }, diff --git a/services/monitoring/grafana-dashboard-network.yaml b/services/monitoring/grafana-dashboard-network.yaml index 6963e89..8f614ae 100644 --- a/services/monitoring/grafana-dashboard-network.yaml +++ b/services/monitoring/grafana-dashboard-network.yaml @@ -151,7 +151,8 @@ data: { "expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))", "refId": "A", - "legendFormat": "{{router}}" + "legendFormat": "{{router}}", + "instant": true } ], "fieldConfig": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 12555ee..ac95eae 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -79,7 +79,7 @@ data: { "id": 2, "type": "stat", - "title": "Ready workers", + "title": "Workers ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -449,7 +449,8 @@ data: { "expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -521,7 +522,8 @@ data: { "expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -593,7 +595,8 @@ data: { "expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -661,7 +664,8 @@ data: { "expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))", "refId": "A", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -728,13 +732,13 @@ data: "targets": [ { "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))", - "refId": "A" + "refId": "A", + "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { - "unit": "percent", - "displayName": "{{namespace}}" + "unit": "percent" }, "overrides": [] }, @@ -770,13 +774,13 @@ data: "targets": [ { "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))", - "refId": "A" + "refId": "A", + "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { - "unit": "percent", - "displayName": "{{namespace}}" + "unit": "percent" }, "overrides": [] }, diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 58035b6..5a8f1ba 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -209,6 +209,16 @@ spec: - action: keep source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of] regex: flux-system;flux + - job_name: "titan-db" + static_configs: + - targets: ["titan-db:9100"] + relabel_configs: + - source_labels: [__address__] + target_label: instance + metric_relabel_configs: + - source_labels: [instance] + target_label: node + replacement: titan-db ---