diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 1cc89a7..76fe0bc 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -214,6 +214,7 @@ UPTIME_AVAIL_EXPR = ( f"min(({CONTROL_READY_FRACTION_EXPR}), ({TRAEFIK_READY_EXPR}))" ) UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:5m])" +UPTIME_PERCENT_EXPR = f"({UPTIME_AVG_EXPR}) * 100" UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))" UPTIME_THRESHOLDS = { "mode": "absolute", @@ -224,6 +225,15 @@ UPTIME_THRESHOLDS = { {"color": "green", "value": 3.5}, ], } +UPTIME_PERCENT_THRESHOLDS = { + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 99}, + {"color": "yellow", "value": 99.9}, + {"color": "green", "value": 99.99}, + ], +} PROBLEM_TABLE_EXPR = ( "(time() - kube_pod_created{pod!=\"\"}) " "* on(namespace,pod) group_left(node) kube_pod_info " @@ -624,11 +634,11 @@ def build_overview(): }, { "id": 27, - "title": "Atlas Uptime (30d, 9s)", - "expr": UPTIME_NINES_EXPR, + "title": "Atlas Availability (30d)", + "expr": UPTIME_PERCENT_EXPR, "kind": "stat", - "thresholds": UPTIME_THRESHOLDS, - "value_suffix": " 9s", + "thresholds": UPTIME_PERCENT_THRESHOLDS, + "value_suffix": "%", "text_mode": "value", }, { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 36997b8..04e5929 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -213,7 +213,7 @@ { "id": 27, "type": "stat", - "title": "Atlas Uptime (30d, 9s)", + "title": "Atlas Availability (30d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -226,7 +226,7 @@ }, "targets": [ { - "expr": "-log10(1 - clamp_max(avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m]), 0.999999999))", + "expr": "(avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m])) * 100", "refId": "A" } ], @@ -245,22 +245,22 @@ }, { "color": "orange", - "value": 2 + "value": 99 }, { "color": "yellow", - "value": 3 + "value": 99.9 }, { "color": "green", - "value": 3.5 + "value": 99.99 } ] }, "unit": "none", "custom": { "displayMode": "auto", - "valueSuffix": " 9s" + "valueSuffix": "%" } }, "overrides": [] diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index c6d1771..d68631d 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -222,7 +222,7 @@ data: { "id": 27, "type": "stat", - "title": "Atlas Uptime (30d, 9s)", + "title": "Atlas Availability (30d)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -235,7 +235,7 @@ data: }, "targets": [ { - "expr": "-log10(1 - clamp_max(avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m]), 0.999999999))", + "expr": "(avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m])) * 100", "refId": "A" } ], @@ -254,22 +254,22 @@ data: }, { "color": "orange", - "value": 2 + "value": 99 }, { "color": "yellow", - "value": 3 + "value": 99.9 }, { "color": "green", - "value": 3.5 + "value": 99.99 } ] }, "unit": "none", "custom": { "displayMode": "auto", - "valueSuffix": " 9s" + "valueSuffix": "%" } }, "overrides": []