Reduce Atlas availability query density

This commit is contained in:
Brad Stein 2025-12-19 14:56:29 -03:00
parent 8be89cbd53
commit 89f95157d8
3 changed files with 5 additions and 3 deletions

View File

@ -205,6 +205,8 @@ STUCK_TERMINATING_EXPR = (
"or on() vector(0)" "or on() vector(0)"
) )
UPTIME_WINDOW = "365d" UPTIME_WINDOW = "365d"
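# Keep the subquery step coarse so we don't request an excessive number of points:
# over the 365d window a 5m step evaluates ~105,000 points, while a 1h step needs only 8,760.
# Trade-off: availability dips that fall entirely between two samples are not counted.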
UPTIME_STEP = "1h"
TRAEFIK_READY_EXPR = ( TRAEFIK_READY_EXPR = (
"(" "("
'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})' 'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
@ -225,7 +227,7 @@ NODE_TIEBREAKER = " + ".join(
f"({node_filter(node)}) * 1e-6 * {idx}" f"({node_filter(node)}) * 1e-6 * {idx}"
for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1) for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
) )
UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:5m])" UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:{UPTIME_STEP}])"
UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))" UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
UPTIME_THRESHOLDS = { UPTIME_THRESHOLDS = {

View File

@ -226,7 +226,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:5m])", "expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])",
"refId": "A" "refId": "A"
} }
], ],

View File

@ -235,7 +235,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:5m])", "expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])",
"refId": "A" "refId": "A"
} }
], ],