atlas overview: show availability percent and keep uptime centered

This commit is contained in:
Brad Stein 2025-12-12 16:11:28 -03:00
parent 697ce3c18f
commit b200dba5b9
3 changed files with 26 additions and 16 deletions

View File

@@ -214,6 +214,7 @@ UPTIME_AVAIL_EXPR = (
f"min(({CONTROL_READY_FRACTION_EXPR}), ({TRAEFIK_READY_EXPR}))"
)
# Mean of UPTIME_AVAIL_EXPR over UPTIME_WINDOW, evaluated as a subquery at 5m resolution.
UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:5m])"
# The same average multiplied by 100 so the panel can show it as a percentage.
UPTIME_PERCENT_EXPR = f"({UPTIME_AVG_EXPR}) * 100"
# The average expressed as a count of nines: -log10(1 - avg). The average is
# capped at 0.999999999 so that a value of exactly 1 never yields log10(0).
UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
UPTIME_THRESHOLDS = {
"mode": "absolute",
@@ -224,6 +225,15 @@ UPTIME_THRESHOLDS = {
{"color": "green", "value": 3.5},
],
}
# Colour steps for the availability stat panel that displays UPTIME_PERCENT_EXPR.
# Step values are percentages: red is the base step (value None), followed by
# orange at 99, yellow at 99.9 and green at 99.99.
UPTIME_PERCENT_THRESHOLDS = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 99},
{"color": "yellow", "value": 99.9},
{"color": "green", "value": 99.99},
],
}
PROBLEM_TABLE_EXPR = (
"(time() - kube_pod_created{pod!=\"\"}) "
"* on(namespace,pod) group_left(node) kube_pod_info "
@@ -624,11 +634,11 @@ def build_overview():
},
{
"id": 27,
"title": "Atlas Uptime (30d, 9s)",
"expr": UPTIME_NINES_EXPR,
"title": "Atlas Availability (30d)",
"expr": UPTIME_PERCENT_EXPR,
"kind": "stat",
"thresholds": UPTIME_THRESHOLDS,
"value_suffix": " 9s",
"thresholds": UPTIME_PERCENT_THRESHOLDS,
"value_suffix": "%",
"text_mode": "value",
},
{

View File

@@ -213,7 +213,7 @@
{
"id": 27,
"type": "stat",
"title": "Atlas Uptime (30d, 9s)",
"title": "Atlas Availability (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@@ -226,7 +226,7 @@
},
"targets": [
{
"expr": "-log10(1 - clamp_max(avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m]), 0.999999999))",
"expr": "(avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m])) * 100",
"refId": "A"
}
],
@@ -245,22 +245,22 @@
},
{
"color": "orange",
"value": 2
"value": 99
},
{
"color": "yellow",
"value": 3
"value": 99.9
},
{
"color": "green",
"value": 3.5
"value": 99.99
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto",
"valueSuffix": " 9s"
"valueSuffix": "%"
}
},
"overrides": []

View File

@@ -222,7 +222,7 @@ data:
{
"id": 27,
"type": "stat",
"title": "Atlas Uptime (30d, 9s)",
"title": "Atlas Availability (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@@ -235,7 +235,7 @@ data:
},
"targets": [
{
"expr": "-log10(1 - clamp_max(avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m]), 0.999999999))",
"expr": "(avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m])) * 100",
"refId": "A"
}
],
@@ -254,22 +254,22 @@ data:
},
{
"color": "orange",
"value": 2
"value": 99
},
{
"color": "yellow",
"value": 3
"value": 99.9
},
{
"color": "green",
"value": 3.5
"value": 99.99
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto",
"valueSuffix": " 9s"
"valueSuffix": "%"
}
},
"overrides": []