monitoring: bound atlas availability query

This commit is contained in:
jenkins 2026-05-10 14:40:55 -03:00
parent 521eda1c00
commit 32ffe30145
4 changed files with 38 additions and 18 deletions

View File

@ -312,7 +312,7 @@ STUCK_TERMINATING_EXPR = (
')) ' ')) '
"or on() vector(0)" "or on() vector(0)"
) )
UPTIME_WINDOW = "365d" UPTIME_WINDOW = "30d"
# Keep the subquery step coarse so we don't request an excessive number of points. # Keep the subquery step coarse so we don't request an excessive number of points.
UPTIME_STEP = "1h" UPTIME_STEP = "1h"
TRAEFIK_READY_EXPR = ( TRAEFIK_READY_EXPR = (
@ -1433,13 +1433,15 @@ def build_overview():
}, },
{ {
"id": 27, "id": 27,
"title": "Atlas Availability", "title": "Atlas Availability (30d)",
"expr": UPTIME_PERCENT_EXPR, "expr": UPTIME_PERCENT_EXPR,
"kind": "stat", "kind": "stat",
"thresholds": UPTIME_PERCENT_THRESHOLDS, "thresholds": UPTIME_PERCENT_THRESHOLDS,
"unit": "percentunit", "unit": "percentunit",
"decimals": 4, "decimals": 4,
"text_mode": "value", "text_mode": "value",
"instant": True,
"description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime.",
}, },
{ {
"id": 4, "id": 4,
@ -1509,14 +1511,16 @@ def build_overview():
item["expr"], item["expr"],
grid, grid,
thresholds=item.get("thresholds"), thresholds=item.get("thresholds"),
legend=None, legend=None,
links=item.get("links"), links=item.get("links"),
text_mode=item.get("text_mode", "value"), text_mode=item.get("text_mode", "value"),
value_suffix=item.get("value_suffix"), value_suffix=item.get("value_suffix"),
unit=item.get("unit", "none"), unit=item.get("unit", "none"),
decimals=item.get("decimals"), decimals=item.get("decimals"),
) instant=item.get("instant", False),
) description=item.get("description"),
)
)
else: else:
panels.append( panels.append(
gauge_panel( gauge_panel(

View File

@ -50,6 +50,18 @@ def test_node_filter_and_expr_helpers():
assert "node_memory_MemAvailable_bytes" in mem_expr assert "node_memory_MemAvailable_bytes" in mem_expr
def test_overview_availability_panel_is_recent_and_instant():
    """Check overview panel 27: 30d title, 30d subquery window, instant target, description."""
    overview = load_module().build_overview()
    # Stop at the first panel whose id is 27; later panels are never inspected.
    availability = next(
        filter(
            lambda candidate: candidate["id"] == 27,
            flatten_panels(overview["panels"]),
        )
    )
    assert availability["title"] == "Atlas Availability (30d)"
    first_target = availability["targets"][0]
    query = first_target["expr"]
    # The subquery must use the 30d window at a 1h step, with no trace of the old 365d window.
    assert "[30d:1h]" in query
    assert "365d" not in query
    assert first_target["instant"] is True
    assert "pre-metric history" in availability["description"]
def test_render_configmap_writes(tmp_path): def test_render_configmap_writes(tmp_path):
mod = load_module() mod = load_module()
mod.DASHBOARD_DIR = tmp_path / "dash" mod.DASHBOARD_DIR = tmp_path / "dash"

View File

@ -213,7 +213,7 @@
{ {
"id": 27, "id": 27,
"type": "stat", "type": "stat",
"title": "Atlas Availability", "title": "Atlas Availability (30d)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -226,8 +226,9 @@
}, },
"targets": [ "targets": [
{ {
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])", "expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])",
"refId": "A" "refId": "A",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {
@ -281,7 +282,8 @@
"values": false "values": false
}, },
"textMode": "value" "textMode": "value"
} },
"description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime."
}, },
{ {
"id": 4, "id": 4,

View File

@ -222,7 +222,7 @@ data:
{ {
"id": 27, "id": 27,
"type": "stat", "type": "stat",
"title": "Atlas Availability", "title": "Atlas Availability (30d)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -235,8 +235,9 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])", "expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])",
"refId": "A" "refId": "A",
"instant": true
} }
], ],
"fieldConfig": { "fieldConfig": {
@ -290,7 +291,8 @@ data:
"values": false "values": false
}, },
"textMode": "value" "textMode": "value"
} },
"description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime."
}, },
{ {
"id": 4, "id": 4,