monitoring: fix hottest stats and titan-db scrape

This commit is contained in:
Brad Stein 2025-11-17 19:38:40 -03:00
parent fe8deea9c7
commit a1e731e929
6 changed files with 52 additions and 33 deletions

View File

@ -221,6 +221,7 @@ def stat_panel(
thresholds=None,
text_mode="value",
legend=None,
instant=False,
value_suffix=None,
links=None,
):
@ -259,6 +260,8 @@ def stat_panel(
}
if legend:
panel["targets"][0]["legendFormat"] = legend
if instant:
panel["targets"][0]["instant"] = True
if links:
panel["links"] = links
return panel
@ -339,14 +342,8 @@ def pie_panel(panel_id, title, expr, grid):
"title": title,
"datasource": PROM_DS,
"gridPos": grid,
"targets": [{"expr": expr, "refId": "A"}],
"fieldConfig": {
"defaults": {
"unit": "percent",
"displayName": "{{namespace}}",
},
"overrides": [],
},
"targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
"fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []},
"options": {
"legend": {"displayMode": "list", "placement": "right"},
"pieType": "pie",
@ -382,7 +379,7 @@ def build_overview():
(1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None),
(
2,
"Ready workers",
"Workers ready",
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
WORKER_SUFFIX,
WORKER_TOTAL,
@ -480,6 +477,7 @@ def build_overview():
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
text_mode="value_and_name",
legend="{{node}}",
instant=True,
links=link_to("atlas-nodes"),
)
)
@ -1016,6 +1014,7 @@ def build_network_dashboard():
{"h": 4, "w": 8, "x": 16, "y": 0},
unit="req/s",
legend="{{router}}",
instant=True,
)
)
panels.append(

View File

@ -142,7 +142,8 @@
{
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
"refId": "A",
"legendFormat": "{{router}}"
"legendFormat": "{{router}}",
"instant": true
}
],
"fieldConfig": {

View File

@ -70,7 +70,7 @@
{
"id": 2,
"type": "stat",
"title": "Ready workers",
"title": "Workers ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -440,7 +440,8 @@
{
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A",
"legendFormat": "{{node}}"
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
@ -512,7 +513,8 @@
{
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A",
"legendFormat": "{{node}}"
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
@ -584,7 +586,8 @@
{
"expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
"refId": "A",
"legendFormat": "{{node}}"
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
@ -652,7 +655,8 @@
{
"expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
"refId": "A",
"legendFormat": "{{node}}"
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
@ -719,13 +723,13 @@
"targets": [
{
"expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))",
"refId": "A"
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"displayName": "{{namespace}}"
"unit": "percent"
},
"overrides": []
},
@ -761,13 +765,13 @@
"targets": [
{
"expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))",
"refId": "A"
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"displayName": "{{namespace}}"
"unit": "percent"
},
"overrides": []
},

View File

@ -151,7 +151,8 @@ data:
{
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
"refId": "A",
"legendFormat": "{{router}}"
"legendFormat": "{{router}}",
"instant": true
}
],
"fieldConfig": {

View File

@ -79,7 +79,7 @@ data:
{
"id": 2,
"type": "stat",
"title": "Ready workers",
"title": "Workers ready",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -449,7 +449,8 @@ data:
{
"expr": "topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A",
"legendFormat": "{{node}}"
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
@ -521,7 +522,8 @@ data:
{
"expr": "topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\")))",
"refId": "A",
"legendFormat": "{{node}}"
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
@ -593,7 +595,8 @@ data:
{
"expr": "topk(1, avg by (node) (rate(container_network_receive_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m]) + rate(container_network_transmit_bytes_total{namespace!=\"\" ,pod!=\"\"}[5m])))",
"refId": "A",
"legendFormat": "{{node}}"
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
@ -661,7 +664,8 @@ data:
{
"expr": "topk(1, avg by (node) (rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])))",
"refId": "A",
"legendFormat": "{{node}}"
"legendFormat": "{{node}}",
"instant": true
}
],
"fieldConfig": {
@ -728,13 +732,13 @@ data:
"targets": [
{
"expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\"}[5m])) by (namespace))",
"refId": "A"
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"displayName": "{{namespace}}"
"unit": "percent"
},
"overrides": []
},
@ -770,13 +774,13 @@ data:
"targets": [
{
"expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\"}) by (namespace))",
"refId": "A"
"refId": "A",
"legendFormat": "{{namespace}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"displayName": "{{namespace}}"
"unit": "percent"
},
"overrides": []
},

View File

@ -209,6 +209,16 @@ spec:
- action: keep
source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app_kubernetes_io_part_of]
regex: flux-system;flux
# Static scrape job for the titan-db host: a single fixed target, titan-db:9100.
# NOTE(review): 9100 is conventionally the node_exporter port — confirm what is listening there.
- job_name: "titan-db"
static_configs:
- targets: ["titan-db:9100"]
relabel_configs:
# Target relabeling: writes the scrape address into the `instance` label.
# No `regex`/`action`/`replacement` is given, so this relies on the relabel defaults — TODO confirm.
- source_labels: [__address__]
target_label: instance
metric_relabel_configs:
# Sample relabeling: sets a `node` label to the constant `titan-db`.
# Presumably this lets node-keyed dashboard queries (`... by (node)`) include this host — confirm.
- source_labels: [instance]
target_label: node
replacement: titan-db
---