atlas internal dashboards: add SLO/burn and api health panels

This commit is contained in:
Brad Stein 2025-12-12 18:00:43 -03:00
parent 0a0966db78
commit bf6179f907
5 changed files with 1171 additions and 178 deletions

View File

@ -327,6 +327,34 @@ NET_INTERNAL_EXPR = (
'+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))' '+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))'
' or on() vector(0)' ' or on() vector(0)'
) )
# Kubernetes control-plane health expressions (PromQL).
# The latency histograms are named *_seconds_bucket, so the quantiles are
# multiplied by 1000 to report milliseconds.
APISERVER_5XX_RATE = 'sum(rate(apiserver_request_total{code=~"5.."}[5m]))'
APISERVER_P99_LATENCY_MS = (
    "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000"
)
ETCD_P99_LATENCY_MS = (
    "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000"
)

# Traefik edge request rates over a 5m window: all requests, and requests whose
# status code is not 5xx.
TRAEFIK_TOTAL_5M = "sum(rate(traefik_entrypoint_requests_total[5m]))"
TRAEFIK_SUCCESS_5M = 'sum(rate(traefik_entrypoint_requests_total{code!~"5.."}[5m]))'
# Success ratio = non-5xx rate / total rate.
# The denominator is a rate in req/s, so it must NOT be clamped to 1: doing so
# turns the ratio into the raw success rate whenever traffic is below 1 req/s
# (and into 0 when idle). `> 0` drops the sample when there is no traffic, so
# the division never becomes 0/0 and an idle window simply yields no value.
TRAEFIK_SLI_5M = f"({TRAEFIK_SUCCESS_5M}) / (({TRAEFIK_TOTAL_5M}) > 0)"

# Traefik edge latency quantiles, converted from seconds to milliseconds.
TRAEFIK_P99_LATENCY_MS = (
    "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
)
TRAEFIK_P95_LATENCY_MS = (
    "histogram_quantile(0.95, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
)
# Availability objective for the Traefik edge; the error budget is 1 - SLO.
SLO_AVAILABILITY = 0.999


def traefik_sli(window):
    """Return a PromQL expression for the Traefik success ratio over *window*.

    The ratio is the rate of non-5xx entrypoint requests divided by the rate
    of all entrypoint requests.

    Args:
        window: PromQL range duration without brackets, e.g. ``"5m"`` or ``"1h"``.

    Returns:
        The PromQL expression as a string. It yields no sample while the total
        request rate is zero.
    """
    total = f"sum(rate(traefik_entrypoint_requests_total[{window}]))"
    success = f'sum(rate(traefik_entrypoint_requests_total{{code!~"5.."}}[{window}]))'
    # Divide by the real total. Clamping the denominator to 1 req/s would
    # understate the ratio for any traffic below 1 req/s; filtering with `> 0`
    # is enough to avoid a 0/0 division.
    return f"({success}) / (({total}) > 0)"


def traefik_burn(window):
    """Return a PromQL expression for the error-budget burn rate over *window*.

    Burn rate is the observed error ratio ``1 - SLI`` divided by the error
    budget ``1 - SLO_AVAILABILITY``; a value of 1 means the budget is being
    consumed exactly at the rate the SLO allows.

    Args:
        window: PromQL range duration without brackets, e.g. ``"1h"`` or ``"6h"``.

    Returns:
        The PromQL expression as a string.
    """
    # 1 - 0.999 evaluates to 0.0010000000000000009 in binary floating point.
    # Format to 10 significant digits so the generated query carries the
    # intended budget (0.001) instead of the representation artefact.
    error_budget = f"{1 - SLO_AVAILABILITY:.10g}"
    return f"(1 - ({traefik_sli(window)})) / {error_budget}"
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Panel factories # Panel factories
@ -1067,12 +1095,69 @@ def build_nodes_dashboard():
{"h": 4, "w": 8, "x": 16, "y": 0}, {"h": 4, "w": 8, "x": 16, "y": 0},
) )
) )
panels.append(
stat_panel(
9,
"API Server 5xx rate",
APISERVER_5XX_RATE,
{"h": 4, "w": 8, "x": 0, "y": 4},
unit="req/s",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 0.05},
{"color": "orange", "value": 0.2},
{"color": "red", "value": 0.5},
],
},
decimals=3,
)
)
panels.append(
stat_panel(
10,
"API Server P99 latency",
APISERVER_P99_LATENCY_MS,
{"h": 4, "w": 8, "x": 8, "y": 4},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 250},
{"color": "orange", "value": 400},
{"color": "red", "value": 600},
],
},
decimals=1,
)
)
panels.append(
stat_panel(
11,
"etcd P99 latency",
ETCD_P99_LATENCY_MS,
{"h": 4, "w": 8, "x": 16, "y": 4},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 100},
{"color": "red", "value": 200},
],
},
decimals=1,
)
)
panels.append( panels.append(
timeseries_panel( timeseries_panel(
4, 4,
"Node CPU", "Node CPU",
node_cpu_expr(), node_cpu_expr(),
{"h": 9, "w": 24, "x": 0, "y": 4}, {"h": 9, "w": 24, "x": 0, "y": 8},
unit="percent", unit="percent",
legend="{{node}}", legend="{{node}}",
legend_calcs=["last"], legend_calcs=["last"],
@ -1085,7 +1170,7 @@ def build_nodes_dashboard():
5, 5,
"Node RAM", "Node RAM",
node_mem_expr(), node_mem_expr(),
{"h": 9, "w": 24, "x": 0, "y": 13}, {"h": 9, "w": 24, "x": 0, "y": 17},
unit="percent", unit="percent",
legend="{{node}}", legend="{{node}}",
legend_calcs=["last"], legend_calcs=["last"],
@ -1098,7 +1183,7 @@ def build_nodes_dashboard():
6, 6,
"Control Plane (incl. titan-db) CPU", "Control Plane (incl. titan-db) CPU",
node_cpu_expr(CONTROL_ALL_REGEX), node_cpu_expr(CONTROL_ALL_REGEX),
{"h": 9, "w": 12, "x": 0, "y": 22}, {"h": 9, "w": 12, "x": 0, "y": 26},
unit="percent", unit="percent",
legend="{{node}}", legend="{{node}}",
legend_display="table", legend_display="table",
@ -1110,7 +1195,7 @@ def build_nodes_dashboard():
7, 7,
"Control Plane (incl. titan-db) RAM", "Control Plane (incl. titan-db) RAM",
node_mem_expr(CONTROL_ALL_REGEX), node_mem_expr(CONTROL_ALL_REGEX),
{"h": 9, "w": 12, "x": 12, "y": 22}, {"h": 9, "w": 12, "x": 12, "y": 26},
unit="percent", unit="percent",
legend="{{node}}", legend="{{node}}",
legend_display="table", legend_display="table",
@ -1122,7 +1207,7 @@ def build_nodes_dashboard():
8, 8,
"Root Filesystem Usage", "Root Filesystem Usage",
root_usage_expr(), root_usage_expr(),
{"h": 9, "w": 24, "x": 0, "y": 31}, {"h": 9, "w": 24, "x": 0, "y": 35},
unit="percent", unit="percent",
legend="{{node}}", legend="{{node}}",
legend_display="table", legend_display="table",
@ -1249,43 +1334,107 @@ def build_network_dashboard():
panels.append( panels.append(
stat_panel( stat_panel(
1, 1,
"Ingress Traffic", "Ingress Success Rate (5m)",
NET_INGRESS_EXPR, TRAEFIK_SLI_5M,
{"h": 4, "w": 8, "x": 0, "y": 0}, {"h": 4, "w": 6, "x": 0, "y": 0},
unit="Bps", unit="percentunit",
decimals=2,
thresholds={
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 0.995},
{"color": "yellow", "value": 0.999},
{"color": "green", "value": 0.9995},
],
},
) )
) )
panels.append( panels.append(
stat_panel( stat_panel(
2, 2,
"Egress Traffic", "Error Budget Burn (1h)",
NET_EGRESS_EXPR, traefik_burn("1h"),
{"h": 4, "w": 8, "x": 8, "y": 0}, {"h": 4, "w": 6, "x": 6, "y": 0},
unit="Bps", thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 4},
],
},
decimals=2,
) )
) )
panels.append( panels.append(
stat_panel( stat_panel(
3, 3,
"Intra-Cluster Traffic", "Error Budget Burn (6h)",
NET_INTERNAL_EXPR, traefik_burn("6h"),
{"h": 4, "w": 8, "x": 16, "y": 0}, {"h": 4, "w": 6, "x": 12, "y": 0},
unit="Bps", thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 4},
],
},
decimals=2,
) )
) )
panels.append( panels.append(
stat_panel( stat_panel(
4, 4,
"Top Router req/s", "Edge P99 Latency (ms)",
f"topk(1, {TRAEFIK_ROUTER_EXPR})", TRAEFIK_P99_LATENCY_MS,
{"h": 4, "w": 6, "x": 18, "y": 0},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 200},
{"color": "orange", "value": 350},
{"color": "red", "value": 500},
],
},
decimals=1,
)
)
panels.append(
stat_panel(
5,
"Ingress Traffic",
NET_INGRESS_EXPR,
{"h": 4, "w": 8, "x": 0, "y": 4}, {"h": 4, "w": 8, "x": 0, "y": 4},
unit="req/s", unit="Bps",
legend="{{router}}", )
)
panels.append(
stat_panel(
6,
"Egress Traffic",
NET_EGRESS_EXPR,
{"h": 4, "w": 8, "x": 8, "y": 4},
unit="Bps",
)
)
panels.append(
stat_panel(
7,
"Intra-Cluster Traffic",
NET_INTERNAL_EXPR,
{"h": 4, "w": 8, "x": 16, "y": 4},
unit="Bps",
) )
) )
panels.append( panels.append(
timeseries_panel( timeseries_panel(
5, 8,
"Per-Node Throughput", "Per-Node Throughput",
f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})', f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
{"h": 8, "w": 24, "x": 0, "y": 8}, {"h": 8, "w": 24, "x": 0, "y": 8},
@ -1297,7 +1446,7 @@ def build_network_dashboard():
) )
panels.append( panels.append(
table_panel( table_panel(
6, 9,
"Top Namespaces", "Top Namespaces",
'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) ' 'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
'+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))', '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
@ -1308,7 +1457,7 @@ def build_network_dashboard():
) )
panels.append( panels.append(
table_panel( table_panel(
7, 10,
"Top Pods", "Top Pods",
'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) ' 'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
'+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))', '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
@ -1319,7 +1468,7 @@ def build_network_dashboard():
) )
panels.append( panels.append(
timeseries_panel( timeseries_panel(
8, 11,
"Traefik Routers (req/s)", "Traefik Routers (req/s)",
f"topk(10, {TRAEFIK_ROUTER_EXPR})", f"topk(10, {TRAEFIK_ROUTER_EXPR})",
{"h": 9, "w": 12, "x": 0, "y": 25}, {"h": 9, "w": 12, "x": 0, "y": 25},
@ -1331,7 +1480,7 @@ def build_network_dashboard():
) )
panels.append( panels.append(
timeseries_panel( timeseries_panel(
9, 12,
"Traefik Entrypoints (req/s)", "Traefik Entrypoints (req/s)",
'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))', 'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
{"h": 9, "w": 12, "x": 12, "y": 25}, {"h": 9, "w": 12, "x": 12, "y": 25},

View File

@ -7,6 +7,282 @@
{ {
"id": 1, "id": 1,
"type": "stat", "type": "stat",
"title": "Ingress Success Rate (5m)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 0.995
},
{
"color": "yellow",
"value": 0.999
},
{
"color": "green",
"value": 0.9995
}
]
},
"unit": "percentunit",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 2,
"type": "stat",
"title": "Error Budget Burn (1h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 6,
"y": 0
},
"targets": [
{
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 3,
"type": "stat",
"title": "Error Budget Burn (6h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Edge P99 Latency (ms)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 200
},
{
"color": "orange",
"value": 350
},
{
"color": "red",
"value": 500
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "stat",
"title": "Ingress Traffic", "title": "Ingress Traffic",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@ -16,7 +292,7 @@
"h": 4, "h": 4,
"w": 8, "w": 8,
"x": 0, "x": 0,
"y": 0 "y": 4
}, },
"targets": [ "targets": [
{ {
@ -65,7 +341,7 @@
} }
}, },
{ {
"id": 2, "id": 6,
"type": "stat", "type": "stat",
"title": "Egress Traffic", "title": "Egress Traffic",
"datasource": { "datasource": {
@ -76,7 +352,7 @@
"h": 4, "h": 4,
"w": 8, "w": 8,
"x": 8, "x": 8,
"y": 0 "y": 4
}, },
"targets": [ "targets": [
{ {
@ -125,7 +401,7 @@
} }
}, },
{ {
"id": 3, "id": 7,
"type": "stat", "type": "stat",
"title": "Intra-Cluster Traffic", "title": "Intra-Cluster Traffic",
"datasource": { "datasource": {
@ -136,7 +412,7 @@
"h": 4, "h": 4,
"w": 8, "w": 8,
"x": 16, "x": 16,
"y": 0 "y": 4
}, },
"targets": [ "targets": [
{ {
@ -185,68 +461,7 @@
} }
}, },
{ {
"id": 4, "id": 8,
"type": "stat",
"title": "Top Router req/s",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
"refId": "A",
"legendFormat": "{{router}}"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "req/s",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "timeseries", "type": "timeseries",
"title": "Per-Node Throughput", "title": "Per-Node Throughput",
"datasource": { "datasource": {
@ -283,7 +498,7 @@
} }
}, },
{ {
"id": 6, "id": 9,
"type": "table", "type": "table",
"title": "Top Namespaces", "title": "Top Namespaces",
"datasource": { "datasource": {
@ -319,7 +534,7 @@
] ]
}, },
{ {
"id": 7, "id": 10,
"type": "table", "type": "table",
"title": "Top Pods", "title": "Top Pods",
"datasource": { "datasource": {
@ -355,7 +570,7 @@
] ]
}, },
{ {
"id": 8, "id": 11,
"type": "timeseries", "type": "timeseries",
"title": "Traefik Routers (req/s)", "title": "Traefik Routers (req/s)",
"datasource": { "datasource": {
@ -392,7 +607,7 @@
} }
}, },
{ {
"id": 9, "id": 12,
"type": "timeseries", "type": "timeseries",
"title": "Traefik Entrypoints (req/s)", "title": "Traefik Entrypoints (req/s)",
"datasource": { "datasource": {

View File

@ -186,6 +186,213 @@
"textMode": "value" "textMode": "value"
} }
}, },
{
"id": 9,
"type": "stat",
"title": "API Server 5xx rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 0.05
},
{
"color": "orange",
"value": 0.2
},
{
"color": "red",
"value": 0.5
}
]
},
"unit": "req/s",
"custom": {
"displayMode": "auto"
},
"decimals": 3
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 10,
"type": "stat",
"title": "API Server P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 250
},
{
"color": "orange",
"value": 400
},
{
"color": "red",
"value": 600
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 11,
"type": "stat",
"title": "etcd P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 100
},
{
"color": "red",
"value": 200
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{ {
"id": 4, "id": 4,
"type": "timeseries", "type": "timeseries",
@ -198,7 +405,7 @@
"h": 9, "h": 9,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 4 "y": 8
}, },
"targets": [ "targets": [
{ {
@ -238,7 +445,7 @@
"h": 9, "h": 9,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 13 "y": 17
}, },
"targets": [ "targets": [
{ {
@ -278,7 +485,7 @@
"h": 9, "h": 9,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 22 "y": 26
}, },
"targets": [ "targets": [
{ {
@ -315,7 +522,7 @@
"h": 9, "h": 9,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 22 "y": 26
}, },
"targets": [ "targets": [
{ {
@ -352,7 +559,7 @@
"h": 9, "h": 9,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 31 "y": 35
}, },
"targets": [ "targets": [
{ {

View File

@ -16,6 +16,282 @@ data:
{ {
"id": 1, "id": 1,
"type": "stat", "type": "stat",
"title": "Ingress Success Rate (5m)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "(sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[5m]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[5m])), 1)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 0.995
},
{
"color": "yellow",
"value": 0.999
},
{
"color": "green",
"value": 0.9995
}
]
},
"unit": "percentunit",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 2,
"type": "stat",
"title": "Error Budget Burn (1h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 6,
"y": 0
},
"targets": [
{
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[1h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[1h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 3,
"type": "stat",
"title": "Error Budget Burn (6h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "(1 - ((sum(rate(traefik_entrypoint_requests_total{code!~\"5..\"}[6h]))) / clamp_min(sum(rate(traefik_entrypoint_requests_total[6h])), 1))) / 0.0010000000000000009",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "orange",
"value": 2
},
{
"color": "red",
"value": 4
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 2
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Edge P99 Latency (ms)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 200
},
{
"color": "orange",
"value": 350
},
{
"color": "red",
"value": 500
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "stat",
"title": "Ingress Traffic", "title": "Ingress Traffic",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@ -25,7 +301,7 @@ data:
"h": 4, "h": 4,
"w": 8, "w": 8,
"x": 0, "x": 0,
"y": 0 "y": 4
}, },
"targets": [ "targets": [
{ {
@ -74,7 +350,7 @@ data:
} }
}, },
{ {
"id": 2, "id": 6,
"type": "stat", "type": "stat",
"title": "Egress Traffic", "title": "Egress Traffic",
"datasource": { "datasource": {
@ -85,7 +361,7 @@ data:
"h": 4, "h": 4,
"w": 8, "w": 8,
"x": 8, "x": 8,
"y": 0 "y": 4
}, },
"targets": [ "targets": [
{ {
@ -134,7 +410,7 @@ data:
} }
}, },
{ {
"id": 3, "id": 7,
"type": "stat", "type": "stat",
"title": "Intra-Cluster Traffic", "title": "Intra-Cluster Traffic",
"datasource": { "datasource": {
@ -145,7 +421,7 @@ data:
"h": 4, "h": 4,
"w": 8, "w": 8,
"x": 16, "x": 16,
"y": 0 "y": 4
}, },
"targets": [ "targets": [
{ {
@ -194,68 +470,7 @@ data:
} }
}, },
{ {
"id": 4, "id": 8,
"type": "stat",
"title": "Top Router req/s",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "topk(1, sum by (router) (rate(traefik_router_requests_total[5m])))",
"refId": "A",
"legendFormat": "{{router}}"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "req/s",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "timeseries", "type": "timeseries",
"title": "Per-Node Throughput", "title": "Per-Node Throughput",
"datasource": { "datasource": {
@ -292,7 +507,7 @@ data:
} }
}, },
{ {
"id": 6, "id": 9,
"type": "table", "type": "table",
"title": "Top Namespaces", "title": "Top Namespaces",
"datasource": { "datasource": {
@ -328,7 +543,7 @@ data:
] ]
}, },
{ {
"id": 7, "id": 10,
"type": "table", "type": "table",
"title": "Top Pods", "title": "Top Pods",
"datasource": { "datasource": {
@ -364,7 +579,7 @@ data:
] ]
}, },
{ {
"id": 8, "id": 11,
"type": "timeseries", "type": "timeseries",
"title": "Traefik Routers (req/s)", "title": "Traefik Routers (req/s)",
"datasource": { "datasource": {
@ -401,7 +616,7 @@ data:
} }
}, },
{ {
"id": 9, "id": 12,
"type": "timeseries", "type": "timeseries",
"title": "Traefik Entrypoints (req/s)", "title": "Traefik Entrypoints (req/s)",
"datasource": { "datasource": {

View File

@ -195,6 +195,213 @@ data:
"textMode": "value" "textMode": "value"
} }
}, },
{
"id": 9,
"type": "stat",
"title": "API Server 5xx rate",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "sum(rate(apiserver_request_total{code=~\"5..\"}[5m]))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 0.05
},
{
"color": "orange",
"value": 0.2
},
{
"color": "red",
"value": 0.5
}
]
},
"unit": "req/s",
"custom": {
"displayMode": "auto"
},
"decimals": 3
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 10,
"type": "stat",
"title": "API Server P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 250
},
{
"color": "orange",
"value": 400
},
{
"color": "red",
"value": 600
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 11,
"type": "stat",
"title": "etcd P99 latency",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 4
},
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "orange",
"value": 100
},
{
"color": "red",
"value": 200
}
]
},
"unit": "ms",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{ {
"id": 4, "id": 4,
"type": "timeseries", "type": "timeseries",
@ -207,7 +414,7 @@ data:
"h": 9, "h": 9,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 4 "y": 8
}, },
"targets": [ "targets": [
{ {
@ -247,7 +454,7 @@ data:
"h": 9, "h": 9,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 13 "y": 17
}, },
"targets": [ "targets": [
{ {
@ -287,7 +494,7 @@ data:
"h": 9, "h": 9,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 22 "y": 26
}, },
"targets": [ "targets": [
{ {
@ -324,7 +531,7 @@ data:
"h": 9, "h": 9,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 22 "y": 26
}, },
"targets": [ "targets": [
{ {
@ -361,7 +568,7 @@ data:
"h": 9, "h": 9,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 31 "y": 35
}, },
"targets": [ "targets": [
{ {