monitoring: enrich dashboards

This commit is contained in:
Brad Stein 2025-11-16 00:55:28 -03:00
parent 0b1437b77c
commit 52b5f7dfdf
2 changed files with 557 additions and 128 deletions

View File

@ -38,14 +38,14 @@ data:
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 6, "h": 5,
"w": 6, "w": 4,
"x": 0, "x": 0,
"y": 0 "y": 0
}, },
"targets": [ "targets": [
{ {
"expr": "sum(kube_pod_status_phase{phase=\"Running\"})", "expr": "sum(kube_pod_status_phase{phase=\\\"Running\\\"})",
"refId": "A" "refId": "A"
} }
], ],
@ -82,26 +82,27 @@ data:
], ],
"fields": "", "fields": "",
"values": false "values": false
} },
"textMode": "value"
} }
}, },
{ {
"id": 2, "id": 2,
"type": "stat", "type": "stat",
"title": "Ready node percentage", "title": "Ready nodes",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 6, "h": 5,
"w": 6, "w": 4,
"x": 6, "x": 4,
"y": 0 "y": 0
}, },
"targets": [ "targets": [
{ {
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / sum(kube_node_info) * 100", "expr": "sum(kube_node_status_condition{condition=\\\"Ready\\\",status=\\\"true\\\"})",
"refId": "A" "refId": "A"
} }
], ],
@ -112,23 +113,19 @@ data:
}, },
"mappings": [], "mappings": [],
"thresholds": { "thresholds": {
"mode": "percentage", "mode": "absolute",
"steps": [ "steps": [
{ {
"color": "red", "color": "rgba(115, 115, 115, 1)",
"value": null "value": null
}, },
{
"color": "orange",
"value": 90
},
{ {
"color": "green", "color": "green",
"value": 98 "value": 1
} }
] ]
}, },
"unit": "percent" "unit": "none"
}, },
"overrides": [] "overrides": []
}, },
@ -142,26 +139,27 @@ data:
], ],
"fields": "", "fields": "",
"values": false "values": false
} },
"textMode": "value"
} }
}, },
{ {
"id": 3, "id": 3,
"type": "stat", "type": "stat",
"title": "Cluster CPU saturation", "title": "Cluster nodes",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 6, "h": 5,
"w": 6, "w": 4,
"x": 12, "x": 8,
"y": 0 "y": 0
}, },
"targets": [ "targets": [
{ {
"expr": "avg((1 - rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", "expr": "count(kube_node_info)",
"refId": "A" "refId": "A"
} }
], ],
@ -172,23 +170,19 @@ data:
}, },
"mappings": [], "mappings": [],
"thresholds": { "thresholds": {
"mode": "percentage", "mode": "absolute",
"steps": [ "steps": [
{ {
"color": "green", "color": "rgba(115, 115, 115, 1)",
"value": null "value": null
}, },
{ {
"color": "yellow", "color": "green",
"value": 65 "value": 1
},
{
"color": "red",
"value": 85
} }
] ]
}, },
"unit": "percent" "unit": "none"
}, },
"overrides": [] "overrides": []
}, },
@ -202,26 +196,27 @@ data:
], ],
"fields": "", "fields": "",
"values": false "values": false
} },
"textMode": "value"
} }
}, },
{ {
"id": 4, "id": 4,
"type": "stat", "type": "stat",
"title": "Cluster memory usage", "title": "Hottest node CPU",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 6, "h": 5,
"w": 6, "w": 4,
"x": 18, "x": 12,
"y": 0 "y": 0
}, },
"targets": [ "targets": [
{ {
"expr": "100 - (sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes) * 100)", "expr": "topk(1, avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\\\"idle\\\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)))",
"refId": "A" "refId": "A"
} }
], ],
@ -262,11 +257,134 @@ data:
], ],
"fields": "", "fields": "",
"values": false "values": false
} },
"textMode": "value_and_name"
} }
}, },
{ {
"id": 5, "id": 5,
"type": "stat",
"title": "Hottest node memory",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 16,
"y": 0
},
"targets": [
{
"expr": "topk(1, avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 75
},
{
"color": "red",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value_and_name"
}
},
{
"id": 6,
"type": "stat",
"title": "Failed pods (24h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 4,
"x": 20,
"y": 0
},
"targets": [
{
"expr": "sum(increase(kube_pod_status_phase{phase=\\\"Failed\\\"}[24h]))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 7,
"type": "piechart", "type": "piechart",
"title": "Namespace CPU share", "title": "Namespace CPU share",
"datasource": { "datasource": {
@ -277,11 +395,11 @@ data:
"h": 9, "h": 9,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 6 "y": 5
}, },
"targets": [ "targets": [
{ {
"expr": "topk(8, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))", "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\\\"\\\\\",pod!=\\\"\\\\\",container!=\\\"\\\\\"}[5m])) by (namespace))",
"refId": "A" "refId": "A"
} }
], ],
@ -307,7 +425,7 @@ data:
} }
}, },
{ {
"id": 6, "id": 8,
"type": "piechart", "type": "piechart",
"title": "Namespace memory share", "title": "Namespace memory share",
"datasource": { "datasource": {
@ -318,11 +436,11 @@ data:
"h": 9, "h": 9,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 6 "y": 5
}, },
"targets": [ "targets": [
{ {
"expr": "topk(8, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))", "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\\\"\\\\\",pod!=\\\"\\\\\",container!=\\\"\\\\\"}) by (namespace))",
"refId": "A" "refId": "A"
} }
], ],
@ -348,7 +466,7 @@ data:
} }
}, },
{ {
"id": 7, "id": 9,
"type": "timeseries", "type": "timeseries",
"title": "Node CPU usage (per node)", "title": "Node CPU usage (per node)",
"datasource": { "datasource": {
@ -359,13 +477,13 @@ data:
"h": 8, "h": 8,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 15 "y": 14
}, },
"targets": [ "targets": [
{ {
"expr": "avg(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (instance) * 100", "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\\\"idle\\\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))",
"refId": "A", "refId": "A",
"legendFormat": "{{instance}}" "legendFormat": "{{node}}"
} }
], ],
"fieldConfig": { "fieldConfig": {
@ -385,7 +503,7 @@ data:
} }
}, },
{ {
"id": 8, "id": 10,
"type": "timeseries", "type": "timeseries",
"title": "Node memory usage (per node)", "title": "Node memory usage (per node)",
"datasource": { "datasource": {
@ -396,13 +514,13 @@ data:
"h": 8, "h": 8,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 15 "y": 14
}, },
"targets": [ "targets": [
{ {
"expr": "avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) by (instance)", "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
"refId": "A", "refId": "A",
"legendFormat": "{{instance}}" "legendFormat": "{{node}}"
} }
], ],
"fieldConfig": { "fieldConfig": {
@ -422,7 +540,7 @@ data:
} }
}, },
{ {
"id": 9, "id": 11,
"type": "table", "type": "table",
"title": "Key service availability", "title": "Key service availability",
"datasource": { "datasource": {
@ -430,46 +548,23 @@ data:
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 8, "h": 7,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 23 "y": 22
}, },
"targets": [ "targets": [
{ {
"expr": "max by (service) (up{service=~\"traefik|gitea|vault|victoria-metrics-single|grafana|alertmanager\"})", "expr": "label_replace((sum by (deployment,namespace) (kube_deployment_status_replicas_available{deployment=~\\\"traefik|gitea|grafana\\\",namespace=~\\\"traefik|gitea|monitoring\\\"}) / sum by (deployment,namespace) (kube_deployment_spec_replicas{deployment=~\\\"traefik|gitea|grafana\\\",namespace=~\\\"traefik|gitea|monitoring\\\"})), \\\"service\\\", \\\"$1\\\", \\\"deployment\\\", \\\"(.*)\\\") or label_replace((sum by (statefulset,namespace) (kube_statefulset_status_replicas_ready{statefulset=~\\\"vault|alertmanager|victoria-metrics-single-server\\\",namespace=~\\\"vault|monitoring\\\"}) / sum by (statefulset,namespace) (kube_statefulset_status_replicas{statefulset=~\\\"vault|alertmanager|victoria-metrics-single-server\\\",namespace=~\\\"vault|monitoring\\\"})), \\\"service\\\", \\\"$1\\\", \\\"statefulset\\\", \\\"(.*)\\\")",
"refId": "A" "refId": "A"
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"mappings": [ "custom": {
{ "align": "auto"
"id": 0, },
"type": 1, "unit": "percent"
"value": "0",
"text": "Down"
},
{
"id": 1,
"type": 1,
"value": "1",
"text": "Up"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "green",
"value": 1
}
]
}
}, },
"overrides": [] "overrides": []
}, },
@ -478,22 +573,22 @@ data:
} }
}, },
{ {
"id": 10, "id": 12,
"type": "table", "type": "table",
"title": "Failed pods (24h trend)", "title": "Failed pods by namespace (24h)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
}, },
"gridPos": { "gridPos": {
"h": 8, "h": 7,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 23 "y": 22
}, },
"targets": [ "targets": [
{ {
"expr": "topk(10, sum(increase(kube_pod_status_phase{phase=\"Failed\"}[24h])) by (namespace))", "expr": "topk(10, sum by (namespace) (increase(kube_pod_status_phase{phase=\\\"Failed\\\"}[24h])))",
"refId": "A" "refId": "A"
} }
], ],
@ -508,9 +603,9 @@ data:
} }
}, },
{ {
"id": 11, "id": 13,
"type": "timeseries", "type": "timeseries",
"title": "Cluster network throughput", "title": "Root filesystem usage per node",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -519,23 +614,18 @@ data:
"h": 8, "h": 8,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 31 "y": 29
}, },
"targets": [ "targets": [
{ {
"expr": "sum(rate(container_network_receive_bytes_total{namespace!=\"\"}[5m]))", "expr": "avg by (node) ((label_replace(1 - (node_filesystem_avail_bytes{mountpoint=\\\"/\\\",fstype!~\\\"tmpfs|overlay\\\"} / node_filesystem_size_bytes{mountpoint=\\\"/\\\",fstype!~\\\"tmpfs|overlay\\\"}), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)",
"refId": "A", "refId": "A",
"legendFormat": "Receive" "legendFormat": "{{node}}"
},
{
"expr": "sum(rate(container_network_transmit_bytes_total{namespace!=\"\"}[5m]))",
"refId": "B",
"legendFormat": "Transmit"
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"unit": "Bps" "unit": "percent"
}, },
"overrides": [] "overrides": []
}, },
@ -550,9 +640,9 @@ data:
} }
}, },
{ {
"id": 12, "id": 14,
"type": "timeseries", "type": "bargauge",
"title": "Storage usage across nodes", "title": "Nodes closest to full root disks",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -561,40 +651,377 @@ data:
"h": 8, "h": 8,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 31 "y": 29
}, },
"targets": [ "targets": [
{ {
"expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}) * 100)", "expr": "topk(8, avg by (node) ((label_replace(1 - (node_filesystem_avail_bytes{mountpoint=\\\"/\\\",fstype!~\\\"tmpfs|overlay\\\"} / node_filesystem_size_bytes{mountpoint=\\\"/\\\",fstype!~\\\"tmpfs|overlay\\\"}), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
"refId": "A" "refId": "A"
} }
], ],
"fieldConfig": { "fieldConfig": {
"defaults": { "defaults": {
"unit": "percent",
"min": 0,
"max": 100
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 15,
"type": "stat",
"title": "Astreae usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 7,
"w": 6,
"x": 0,
"y": 37
},
"targets": [
{
"expr": "(sum(longhorn_disk_usage_bytes{disk=~\\\"astreae-.*\\\"}) / sum(longhorn_disk_capacity_bytes{disk=~\\\"astreae-.*\\\"})) * 100",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 70
},
{
"color": "red",
"value": 85
}
]
},
"unit": "percent" "unit": "percent"
}, },
"overrides": [] "overrides": []
}, },
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 16,
"type": "stat",
"title": "Asteria usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 7,
"w": 6,
"x": 6,
"y": 37
},
"targets": [
{
"expr": "(sum(longhorn_disk_usage_bytes{disk=~\\\"asteria-.*\\\"}) / sum(longhorn_disk_capacity_bytes{disk=~\\\"asteria-.*\\\"})) * 100",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 70
},
{
"color": "red",
"value": 85
}
]
},
"unit": "percent"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 17,
"type": "stat",
"title": "Astreae schedulable",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 7,
"w": 6,
"x": 12,
"y": 37
},
"targets": [
{
"expr": "(sum(longhorn_disk_capacity_bytes{disk=~\\\"astreae-.*\\\"}) - sum(longhorn_disk_usage_bytes{disk=~\\\"astreae-.*\\\"}) - sum(longhorn_disk_reservation_bytes{disk=~\\\"astreae-.*\\\"}))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "bytesSI"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 18,
"type": "stat",
"title": "Asteria schedulable",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 7,
"w": 6,
"x": 18,
"y": 37
},
"targets": [
{
"expr": "(sum(longhorn_disk_capacity_bytes{disk=~\\\"asteria-.*\\\"}) - sum(longhorn_disk_usage_bytes{disk=~\\\"asteria-.*\\\"}) - sum(longhorn_disk_reservation_bytes{disk=~\\\"asteria-.*\\\"}))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "bytesSI"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 19,
"type": "piechart",
"title": "Longhorn node readiness",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 44
},
"targets": [
{
"expr": "sum(longhorn_node_status{condition=\\\"ready\\\"})",
"refId": "A",
"legendFormat": "Ready"
},
{
"expr": "(longhorn_node_count_total - sum(longhorn_node_status{condition=\\\"ready\\\"}))",
"refId": "B",
"legendFormat": "Offline"
}
],
"fieldConfig": {
"defaults": {
"unit": "none"
},
"overrides": []
},
"options": { "options": {
"legend": { "legend": {
"displayMode": "list", "displayMode": "list",
"placement": "bottom" "placement": "right"
},
"pieType": "donut",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
} }
} }
}, },
{ {
"id": 13, "id": 20,
"type": "piechart",
"title": "Longhorn disk schedulability",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 44
},
"targets": [
{
"expr": "sum(sum by (node,disk) (longhorn_disk_status{condition=\\\"schedulable\\\"}))",
"refId": "A",
"legendFormat": "Schedulable"
},
{
"expr": "(count(sum by (node,disk) (longhorn_disk_status{condition=\\\"ready\\\"})) - sum(sum by (node,disk) (longhorn_disk_status{condition=\\\"schedulable\\\"})))",
"refId": "B",
"legendFormat": "Blocked"
}
],
"fieldConfig": {
"defaults": {
"unit": "none"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "list",
"placement": "right"
},
"pieType": "donut",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 21,
"type": "text", "type": "text",
"title": "About this dashboard", "title": "About this dashboard",
"gridPos": { "gridPos": {
"h": 6, "h": 5,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 39 "y": 51
}, },
"options": { "options": {
"content": "### Atlas at a glance\n- Raspberry Pi + Jetson hybrid cluster with Flux-managed GitOps\n- Metrics powered by VictoriaMetrics, visualized by Grafana\n- Login for SRE mode with pod-level drilldowns, alert routes, and storage health", "mode": "markdown",
"mode": "markdown" "content": "### Atlas at a glance\\n- Flux-managed Pi + Jetson cluster with 20+ active nodes\\n- Longhorn tiers: Astreae (3x replicas) & Asteria (2x replicas) tracked separately\\n- Login for SRE mode with alert routing, Longhorn drilldowns, and workload burn rates"
} }
} }
], ],
@ -614,6 +1041,5 @@ data:
}, },
"title": "Atlas Public Overview", "title": "Atlas Public Overview",
"uid": "atlas-public", "uid": "atlas-public",
"version": 3 "version": 5
} }

View File

@ -45,7 +45,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / sum(kube_node_info) * 100", "expr": "sum(kube_node_status_condition{condition=\\\"Ready\\\",status=\\\"true\\\"}) / count(kube_node_info) * 100",
"refId": "A" "refId": "A"
} }
], ],
@ -86,7 +86,8 @@ data:
], ],
"fields": "", "fields": "",
"values": false "values": false
} },
"textMode": "value"
} }
}, },
{ {
@ -105,7 +106,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum(kube_pod_status_phase{phase=\"Pending\"})", "expr": "sum(kube_pod_status_phase{phase=\\\"Pending\\\"})",
"refId": "A" "refId": "A"
} }
], ],
@ -146,7 +147,8 @@ data:
], ],
"fields": "", "fields": "",
"values": false "values": false
} },
"textMode": "value"
} }
}, },
{ {
@ -206,7 +208,8 @@ data:
], ],
"fields": "", "fields": "",
"values": false "values": false
} },
"textMode": "value"
} }
}, },
{ {
@ -225,7 +228,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum(ALERTS{alertstate=\"firing\"})", "expr": "sum(ALERTS{alertstate=\\\"firing\\\"})",
"refId": "A" "refId": "A"
} }
], ],
@ -266,7 +269,8 @@ data:
], ],
"fields": "", "fields": "",
"values": false "values": false
} },
"textMode": "value"
} }
}, },
{ {
@ -285,9 +289,9 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "avg(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m])) by (instance) * 100", "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\\\"idle\\\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))",
"refId": "A", "refId": "A",
"legendFormat": "{{instance}}" "legendFormat": "{{node}}"
} }
], ],
"fieldConfig": { "fieldConfig": {
@ -322,9 +326,9 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "avg((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100) by (instance)", "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
"refId": "A", "refId": "A",
"legendFormat": "{{instance}}" "legendFormat": "{{node}}"
} }
], ],
"fieldConfig": { "fieldConfig": {
@ -359,7 +363,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "topk(5, sum(rate(container_cpu_usage_seconds_total{pod!=\"\",container!=\"\"}[5m])) by (namespace,pod))", "expr": "topk(5, sum(rate(container_cpu_usage_seconds_total{pod!=\\\"\\\\\",container!=\\\"\\\\\"}[5m])) by (namespace,pod))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}/{{pod}}" "legendFormat": "{{namespace}}/{{pod}}"
} }
@ -396,7 +400,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "topk(5, sum(container_memory_working_set_bytes{pod!=\"\",container!=\"\"}) by (namespace,pod))", "expr": "topk(5, sum(container_memory_working_set_bytes{pod!=\\\"\\\\\",container!=\\\"\\\\\"}) by (namespace,pod))",
"refId": "A", "refId": "A",
"legendFormat": "{{namespace}}/{{pod}}" "legendFormat": "{{namespace}}/{{pod}}"
} }
@ -433,7 +437,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "topk(8, sum(increase(kube_pod_container_status_restarts_total{namespace!=\"\"}[6h])) by (namespace))", "expr": "topk(8, sum(increase(kube_pod_container_status_restarts_total{namespace!=\\\"\\\\\"}[6h])) by (namespace))",
"refId": "A" "refId": "A"
} }
], ],
@ -575,6 +579,5 @@ data:
}, },
"title": "Atlas SRE Overview", "title": "Atlas SRE Overview",
"uid": "atlas-sre", "uid": "atlas-sre",
"version": 2 "version": 4
} }