atlas internal dashboards: add SLO/burn and api health panels

This commit is contained in:
Brad Stein 2025-12-12 18:00:43 -03:00
parent 0a0966db78
commit bf6179f907
5 changed files with 1171 additions and 178 deletions

View File

@ -327,6 +327,34 @@ NET_INTERNAL_EXPR = (
'+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))'
' or on() vector(0)'
)
APISERVER_5XX_RATE = 'sum(rate(apiserver_request_total{code=~"5.."}[5m]))'
APISERVER_P99_LATENCY_MS = (
"histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000"
)
ETCD_P99_LATENCY_MS = (
"histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000"
)
TRAEFIK_TOTAL_5M = "sum(rate(traefik_entrypoint_requests_total[5m]))"
TRAEFIK_SUCCESS_5M = 'sum(rate(traefik_entrypoint_requests_total{code!~"5.."}[5m]))'
TRAEFIK_SLI_5M = f"({TRAEFIK_SUCCESS_5M}) / clamp_min({TRAEFIK_TOTAL_5M}, 1)"
TRAEFIK_P99_LATENCY_MS = (
"histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
)
TRAEFIK_P95_LATENCY_MS = (
"histogram_quantile(0.95, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
)
SLO_AVAILABILITY = 0.999
def traefik_sli(window):
total = f'sum(rate(traefik_entrypoint_requests_total[{window}]))'
success = f'sum(rate(traefik_entrypoint_requests_total{{code!~"5.."}}[{window}]))'
return f"({success}) / clamp_min({total}, 1)"
def traefik_burn(window):
sli = traefik_sli(window)
return f"(1 - ({sli})) / {1 - SLO_AVAILABILITY}"
# ---------------------------------------------------------------------------
# Panel factories
@ -1067,12 +1095,69 @@ def build_nodes_dashboard():
{"h": 4, "w": 8, "x": 16, "y": 0},
)
)
panels.append(
stat_panel(
9,
"API Server 5xx rate",
APISERVER_5XX_RATE,
{"h": 4, "w": 8, "x": 0, "y": 4},
unit="req/s",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 0.05},
{"color": "orange", "value": 0.2},
{"color": "red", "value": 0.5},
],
},
decimals=3,
)
)
panels.append(
stat_panel(
10,
"API Server P99 latency",
APISERVER_P99_LATENCY_MS,
{"h": 4, "w": 8, "x": 8, "y": 4},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 250},
{"color": "orange", "value": 400},
{"color": "red", "value": 600},
],
},
decimals=1,
)
)
panels.append(
stat_panel(
11,
"etcd P99 latency",
ETCD_P99_LATENCY_MS,
{"h": 4, "w": 8, "x": 16, "y": 4},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 100},
{"color": "red", "value": 200},
],
},
decimals=1,
)
)
panels.append(
timeseries_panel(
4,
"Node CPU",
node_cpu_expr(),
{"h": 9, "w": 24, "x": 0, "y": 4},
{"h": 9, "w": 24, "x": 0, "y": 8},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1085,7 +1170,7 @@ def build_nodes_dashboard():
5,
"Node RAM",
node_mem_expr(),
{"h": 9, "w": 24, "x": 0, "y": 13},
{"h": 9, "w": 24, "x": 0, "y": 17},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
@ -1098,7 +1183,7 @@ def build_nodes_dashboard():
6,
"Control Plane (incl. titan-db) CPU",
node_cpu_expr(CONTROL_ALL_REGEX),
{"h": 9, "w": 12, "x": 0, "y": 22},
{"h": 9, "w": 12, "x": 0, "y": 26},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1110,7 +1195,7 @@ def build_nodes_dashboard():
7,
"Control Plane (incl. titan-db) RAM",
node_mem_expr(CONTROL_ALL_REGEX),
{"h": 9, "w": 12, "x": 12, "y": 22},
{"h": 9, "w": 12, "x": 12, "y": 26},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1122,7 +1207,7 @@ def build_nodes_dashboard():
8,
"Root Filesystem Usage",
root_usage_expr(),
{"h": 9, "w": 24, "x": 0, "y": 31},
{"h": 9, "w": 24, "x": 0, "y": 35},
unit="percent",
legend="{{node}}",
legend_display="table",
@ -1249,43 +1334,107 @@ def build_network_dashboard():
panels.append(
stat_panel(
1,
"Ingress Traffic",
NET_INGRESS_EXPR,
{"h": 4, "w": 8, "x": 0, "y": 0},
unit="Bps",
"Ingress Success Rate (5m)",
TRAEFIK_SLI_5M,
{"h": 4, "w": 6, "x": 0, "y": 0},
unit="percentunit",
decimals=2,
thresholds={
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 0.995},
{"color": "yellow", "value": 0.999},
{"color": "green", "value": 0.9995},
],
},
)
)
panels.append(
stat_panel(
2,
"Egress Traffic",
NET_EGRESS_EXPR,
{"h": 4, "w": 8, "x": 8, "y": 0},
unit="Bps",
"Error Budget Burn (1h)",
traefik_burn("1h"),
{"h": 4, "w": 6, "x": 6, "y": 0},
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 4},
],
},
decimals=2,
)
)
panels.append(
stat_panel(
3,
"Intra-Cluster Traffic",
NET_INTERNAL_EXPR,
{"h": 4, "w": 8, "x": 16, "y": 0},
unit="Bps",
"Error Budget Burn (6h)",
traefik_burn("6h"),
{"h": 4, "w": 6, "x": 12, "y": 0},
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 4},
],
},
decimals=2,
)
)
panels.append(
stat_panel(
4,
"Top Router req/s",
f"topk(1, {TRAEFIK_ROUTER_EXPR})",
"Edge P99 Latency (ms)",
TRAEFIK_P99_LATENCY_MS,
{"h": 4, "w": 6, "x": 18, "y": 0},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 200},
{"color": "orange", "value": 350},
{"color": "red", "value": 500},
],
},
decimals=1,
)
)
panels.append(
stat_panel(
5,
"Ingress Traffic",
NET_INGRESS_EXPR,
{"h": 4, "w": 8, "x": 0, "y": 4},
unit="req/s",
legend="{{router}}",
unit="Bps",
)
)
panels.append(
stat_panel(
6,
"Egress Traffic",
NET_EGRESS_EXPR,
{"h": 4, "w": 8, "x": 8, "y": 4},
unit="Bps",
)
)
panels.append(
stat_panel(
7,
"Intra-Cluster Traffic",
NET_INTERNAL_EXPR,
{"h": 4, "w": 8, "x": 16, "y": 4},
unit="Bps",
)
)
panels.append(
timeseries_panel(
5,
8,
"Per-Node Throughput",
f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
{"h": 8, "w": 24, "x": 0, "y": 8},
@ -1297,7 +1446,7 @@ def build_network_dashboard():
)
panels.append(
table_panel(
6,
9,
"Top Namespaces",
'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
'+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
@ -1308,7 +1457,7 @@ def build_network_dashboard():
)
panels.append(
table_panel(
7,
10,
"Top Pods",
'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
'+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
@ -1319,7 +1468,7 @@ def build_network_dashboard():
)
panels.append(
timeseries_panel(
8,
11,
"Traefik Routers (req/s)",
f"topk(10, {TRAEFIK_ROUTER_EXPR})",
{"h": 9, "w": 12, "x": 0, "y": 25},
@ -1331,7 +1480,7 @@ def build_network_dashboard():
)
panels.append(
timeseries_panel(
9,
12,
"Traefik Entrypoints (req/s)",
'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
{"h": 9, "w": 12, "x": 12, "y": 25},

View File

@ -7,6 +7,282 @@
{
"id": 1,
"type": "stat",
"title": "Ingress Success Rate (5m)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 0.995
},
{
"color": "yellow",
"value": 0.999
},
{
"color": "green",
"value": 0.9995
}
]
},
"unit": "percentunit",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 2,
"type": "stat",
"title": "Error Budget Burn (1h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 6,
"y": 0
},
"targets": [
{
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 3,
"type": "stat",
"title": "Error Budget Burn (6h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Edge P99 Latency (ms)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 200
},
{
"color": "orange",
"value": 350
},
{
"color": "red",
"value": 500
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "stat",
"title": "Ingress Traffic",
"datasource": {
"type": "prometheus",
@ -16,7 +292,7 @@
"h": 4,
"w": 8,
"x": 0,
"y": 0
"y": 4
},
"targets": [
{
@ -65,7 +341,7 @@
}
},
{
"id": 2,
"id": 6,
"type": "stat",
"title": "Egress Traffic",
"datasource": {
@ -76,7 +352,7 @@
"h": 4,
"w": 8,
"x": 8,
"y": 0
"y": 4
},
"targets": [
{
@ -125,7 +401,7 @@
}
},
{
"id": 3,
"id": 7,
"type": "stat",
"title": "Intra-Cluster Traffic",
"datasource": {
@ -136,7 +412,7 @@
"h": 4,
"w": 8,
"x": 16,
"y": 0
"y": 4
},
"targets": [
{
@ -185,68 +461,7 @@
}
},
{
"id": 4,
"type": "stat",
"title": "Top Router req/s",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
"refId": "A",
"legendFormat": "{{router}}"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "req/s",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"id": 8,
"type": "timeseries",
"title": "Per-Node Throughput",
"datasource": {
@ -283,7 +498,7 @@
}
},
{
"id": 6,
"id": 9,
"type": "table",
"title": "Top Namespaces",
"datasource": {
@ -319,7 +534,7 @@
]
},
{
"id": 7,
"id": 10,
"type": "table",
"title": "Top Pods",
"datasource": {
@ -355,7 +570,7 @@
]
},
{
"id": 8,
"id": 11,
"type": "timeseries",
"title": "Traefik Routers (req/s)",
"datasource": {
@ -392,7 +607,7 @@
}
},
{
"id": 9,
"id": 12,
"type": "timeseries",
"title": "Traefik Entrypoints (req/s)",
"datasource": {

View File

@ -186,6 +186,213 @@
"textMode": "value"
}
},
{
"id": 9,
"type": "stat",
"title": "API Server 5xx rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 0.05
},
{
"color": "orange",
"value": 0.2
},
{
"color": "red",
"value": 0.5
}
]
},
"unit": "req/s",
"custom": {
"displayMode": "auto"
},
"decimals": 3
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 10,
"type": "stat",
"title": "API Server P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 250
},
{
"color": "orange",
"value": 400
},
{
"color": "red",
"value": 600
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 11,
"type": "stat",
"title": "etcd P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 100
},
{
"color": "red",
"value": 200
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "timeseries",
@ -198,7 +405,7 @@
"h": 9,
"w": 24,
"x": 0,
"y": 4
"y": 8
},
"targets": [
{
@ -238,7 +445,7 @@
"h": 9,
"w": 24,
"x": 0,
"y": 13
"y": 17
},
"targets": [
{
@ -278,7 +485,7 @@
"h": 9,
"w": 12,
"x": 0,
"y": 22
"y": 26
},
"targets": [
{
@ -315,7 +522,7 @@
"h": 9,
"w": 12,
"x": 12,
"y": 22
"y": 26
},
"targets": [
{
@ -352,7 +559,7 @@
"h": 9,
"w": 24,
"x": 0,
"y": 31
"y": 35
},
"targets": [
{

View File

@ -16,6 +16,282 @@ data:
{
"id": 1,
"type": "stat",
"title": "Ingress Success Rate (5m)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 0.995
},
{
"color": "yellow",
"value": 0.999
},
{
"color": "green",
"value": 0.9995
}
]
},
"unit": "percentunit",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 2,
"type": "stat",
"title": "Error Budget Burn (1h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 6,
"y": 0
},
"targets": [
{
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 3,
"type": "stat",
"title": "Error Budget Burn (6h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Edge P99 Latency (ms)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 200
},
{
"color": "orange",
"value": 350
},
{
"color": "red",
"value": 500
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "stat",
"title": "Ingress Traffic",
"datasource": {
"type": "prometheus",
@ -25,7 +301,7 @@ data:
"h": 4,
"w": 8,
"x": 0,
"y": 0
"y": 4
},
"targets": [
{
@ -74,7 +350,7 @@ data:
}
},
{
"id": 2,
"id": 6,
"type": "stat",
"title": "Egress Traffic",
"datasource": {
@ -85,7 +361,7 @@ data:
"h": 4,
"w": 8,
"x": 8,
"y": 0
"y": 4
},
"targets": [
{
@ -134,7 +410,7 @@ data:
}
},
{
"id": 3,
"id": 7,
"type": "stat",
"title": "Intra-Cluster Traffic",
"datasource": {
@ -145,7 +421,7 @@ data:
"h": 4,
"w": 8,
"x": 16,
"y": 0
"y": 4
},
"targets": [
{
@ -194,68 +470,7 @@ data:
}
},
{
"id": 4,
"type": "stat",
"title": "Top Router req/s",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
"refId": "A",
"legendFormat": "{{router}}"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "req/s",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"id": 8,
"type": "timeseries",
"title": "Per-Node Throughput",
"datasource": {
@ -292,7 +507,7 @@ data:
}
},
{
"id": 6,
"id": 9,
"type": "table",
"title": "Top Namespaces",
"datasource": {
@ -328,7 +543,7 @@ data:
]
},
{
"id": 7,
"id": 10,
"type": "table",
"title": "Top Pods",
"datasource": {
@ -364,7 +579,7 @@ data:
]
},
{
"id": 8,
"id": 11,
"type": "timeseries",
"title": "Traefik Routers (req/s)",
"datasource": {
@ -401,7 +616,7 @@ data:
}
},
{
"id": 9,
"id": 12,
"type": "timeseries",
"title": "Traefik Entrypoints (req/s)",
"datasource": {

View File

@ -195,6 +195,213 @@ data:
"textMode": "value"
}
},
{
"id": 9,
"type": "stat",
"title": "API Server 5xx rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 0.05
},
{
"color": "orange",
"value": 0.2
},
{
"color": "red",
"value": 0.5
}
]
},
"unit": "req/s",
"custom": {
"displayMode": "auto"
},
"decimals": 3
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 10,
"type": "stat",
"title": "API Server P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 250
},
{
"color": "orange",
"value": 400
},
{
"color": "red",
"value": 600
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 11,
"type": "stat",
"title": "etcd P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 100
},
{
"color": "red",
"value": 200
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "timeseries",
@ -207,7 +414,7 @@ data:
"h": 9,
"w": 24,
"x": 0,
"y": 4
"y": 8
},
"targets": [
{
@ -247,7 +454,7 @@ data:
"h": 9,
"w": 24,
"x": 0,
"y": 13
"y": 17
},
"targets": [
{
@ -287,7 +494,7 @@ data:
"h": 9,
"w": 12,
"x": 0,
"y": 22
"y": 26
},
"targets": [
{
@ -324,7 +531,7 @@ data:
"h": 9,
"w": 12,
"x": 12,
"y": 22
"y": 26
},
"targets": [
{
@ -361,7 +568,7 @@ data:
"h": 9,
"w": 24,
"x": 0,
"y": 31
"y": 35
},
"targets": [
{