From 32ffe3014511304a713bfb28d55b753204472e0d Mon Sep 17 00:00:00 2001 From: jenkins Date: Sun, 10 May 2026 14:40:55 -0300 Subject: [PATCH] monitoring: bound atlas availability query --- scripts/dashboards_render_atlas.py | 24 +++++++++++-------- scripts/tests/test_dashboards_render_atlas.py | 12 ++++++++++ .../monitoring/dashboards/atlas-overview.json | 10 ++++---- .../grafana-dashboard-overview.yaml | 10 ++++---- 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 331792aa..e97851a7 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -312,7 +312,7 @@ STUCK_TERMINATING_EXPR = ( ')) ' "or on() vector(0)" ) -UPTIME_WINDOW = "365d" +UPTIME_WINDOW = "30d" # Keep the subquery step coarse so we don't request an excessive number of points. UPTIME_STEP = "1h" TRAEFIK_READY_EXPR = ( @@ -1433,13 +1433,15 @@ def build_overview(): }, { "id": 27, - "title": "Atlas Availability", + "title": "Atlas Availability (30d)", "expr": UPTIME_PERCENT_EXPR, "kind": "stat", "thresholds": UPTIME_PERCENT_THRESHOLDS, "unit": "percentunit", "decimals": 4, "text_mode": "value", + "instant": True, + "description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime.", }, { "id": 4, @@ -1509,14 +1511,16 @@ def build_overview(): item["expr"], grid, thresholds=item.get("thresholds"), - legend=None, - links=item.get("links"), - text_mode=item.get("text_mode", "value"), - value_suffix=item.get("value_suffix"), - unit=item.get("unit", "none"), - decimals=item.get("decimals"), - ) - ) + legend=None, + links=item.get("links"), + text_mode=item.get("text_mode", "value"), + value_suffix=item.get("value_suffix"), + unit=item.get("unit", "none"), + decimals=item.get("decimals"), + instant=item.get("instant", False), + description=item.get("description"), + ) + ) else: panels.append( gauge_panel( diff --git a/scripts/tests/test_dashboards_render_atlas.py b/scripts/tests/test_dashboards_render_atlas.py index a763282e..4cc71938 100644 --- a/scripts/tests/test_dashboards_render_atlas.py +++ b/scripts/tests/test_dashboards_render_atlas.py @@ -50,6 +50,18 @@ def test_node_filter_and_expr_helpers(): assert "node_memory_MemAvailable_bytes" in mem_expr +def test_overview_availability_panel_is_recent_and_instant(): + mod = load_module() + dashboard = mod.build_overview() + panel = next(panel for panel in flatten_panels(dashboard["panels"]) if panel["id"] == 27) + + assert panel["title"] == "Atlas Availability (30d)" + assert "[30d:1h]" in panel["targets"][0]["expr"] + assert "365d" not in panel["targets"][0]["expr"] + assert panel["targets"][0]["instant"] is True + assert "pre-metric history" in panel["description"] + + def test_render_configmap_writes(tmp_path): mod = load_module() mod.DASHBOARD_DIR = tmp_path / "dash" diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index e562c84c..933bc621 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -213,7 +213,7 @@ { "id": 27, "type": "stat", - "title": "Atlas Availability", + "title": "Atlas Availability (30d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -226,8 +226,9 @@ }, "targets": [ { - "expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])", - "refId": "A" + "expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])", + "refId": "A", + "instant": true } ], "fieldConfig": { @@ -281,7 +282,8 @@ "values": false }, "textMode": "value" - } + }, + "description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime." }, { "id": 4, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 351d8a80..c7aab46e 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -222,7 +222,7 @@ data: { "id": 27, "type": "stat", - "title": "Atlas Availability", + "title": "Atlas Availability (30d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -235,8 +235,9 @@ data: }, "targets": [ { - "expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[365d:1h])", - "refId": "A" + "expr": "avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:1h])", + "refId": "A", + "instant": true } ], "fieldConfig": { @@ -290,7 +291,8 @@ data: "values": false }, "textMode": "value" - } + }, + "description": "Rolling 30-day availability from control-plane readiness and Traefik readiness. Kept to 30d so missing pre-metric history is not counted as downtime." }, { "id": 4,