diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 618cf30..1cc89a7 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -199,8 +199,22 @@ STUCK_TERMINATING_EXPR = ( '))' ) UPTIME_WINDOW = "30d" -UPTIME_AVG_EXPR = f"avg(avg_over_time(up[{UPTIME_WINDOW}]))" -UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999))" +TRAEFIK_READY_EXPR = ( + "(" + 'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})' + " / clamp_min(" + 'sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)' + ")" +) +CONTROL_READY_FRACTION_EXPR = ( + f"(sum(kube_node_status_condition{{condition=\"Ready\",status=\"true\",node=~\"{CONTROL_REGEX}\"}})" + f" / {CONTROL_TOTAL})" +) +UPTIME_AVAIL_EXPR = ( + f"min(({CONTROL_READY_FRACTION_EXPR}), ({TRAEFIK_READY_EXPR}))" +) +UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:5m])" +UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))" UPTIME_THRESHOLDS = { "mode": "absolute", "steps": [ @@ -578,22 +592,6 @@ def build_overview(): } row1_stats = [ - { - "id": 1, - "title": "Workers Ready", - "expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', - "kind": "gauge", - "max_value": WORKER_TOTAL, - "thresholds": { - "mode": "absolute", - "steps": [ - {"color": "red", "value": None}, - {"color": "orange", "value": WORKER_TOTAL - 2}, - {"color": "yellow", "value": WORKER_TOTAL - 1}, - {"color": "green", "value": WORKER_TOTAL}, - ], - }, - }, { "id": 2, "title": "Control Plane Ready", @@ -616,6 +614,14 @@ def build_overview(): "thresholds": count_thresholds, "links": link_to("atlas-pods"), }, + { + "id": 5, + "title": "Stuck Terminating", + "expr": STUCK_TERMINATING_EXPR, + "kind": "stat", + "thresholds": count_thresholds, + "links": link_to("atlas-pods"), + }, { "id": 27, "title": "Atlas Uptime (30d, 9s)", @@ -642,12 +648,20 @@ def build_overview(): "links": link_to("atlas-pods"), }, { - "id": 5, - "title": "Stuck Terminating", - "expr": STUCK_TERMINATING_EXPR, - "kind": "stat", - "thresholds": count_thresholds, - "links": link_to("atlas-pods"), + "id": 1, + "title": "Workers Ready", + "expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', + "kind": "gauge", + "max_value": WORKER_TOTAL, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": WORKER_TOTAL - 2}, + {"color": "yellow", "value": WORKER_TOTAL - 1}, + {"color": "green", "value": WORKER_TOTAL}, + ], + }, }, ] diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 455e9d9..36997b8 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -7,67 +7,6 @@ "list": [] }, "panels": [ - { - "id": 1, - "type": "gauge", - "title": "Workers Ready", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 4, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "min": 0, - "max": 18, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "orange", - "value": 16 - }, - { - "color": "yellow", - "value": 17 - }, - { - "color": "green", - "value": 18 - } - ] - } - }, - "overrides": [] - }, - "options": { - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false - } - }, { "id": 2, "type": "gauge", @@ -78,8 +17,8 @@ }, "gridPos": { "h": 5, - "w": 3, - "x": 4, + "w": 4, + "x": 0, "y": 0 }, "targets": [ @@ -132,7 +71,7 @@ "gridPos": { "h": 5, "w": 3, - "x": 7, + "x": 4, "y": 0 }, "targets": [ @@ -196,6 +135,81 @@ } ] }, + { + "id": 5, + "type": "stat", + "title": "Stuck Terminating", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 7, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] + }, { "id": 27, "type": "stat", @@ -212,7 +226,7 @@ }, "targets": [ { - "expr": "-log10(1 - clamp_max(avg(avg_over_time(up[30d])), 0.999999))", + "expr": "-log10(1 - clamp_max(avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m]), 0.999999999))", "refId": "A" } ], @@ -416,9 +430,9 @@ ] }, { - "id": 5, - "type": "stat", - "title": "Stuck Terminating", + "id": 1, + "type": "gauge", + "title": "Workers Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -431,48 +445,39 @@ }, "targets": [ { - "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 18, "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", + "color": "red", "value": null }, - { - "color": "yellow", - "value": 1 - }, { "color": "orange", - "value": 2 + "value": 16 }, { - "color": "red", - "value": 3 + "color": "yellow", + "value": 17 + }, + { + "color": "green", + "value": 18 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -480,15 +485,10 @@ "fields": "", "values": false }, - "textMode": "value" - }, - "links": [ - { - "title": "Open atlas-pods dashboard", - "url": "/d/atlas-pods", - "targetBlank": true - } - ] + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false + } }, { "id": 7, diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 7d6d57f..c6d1771 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -16,67 +16,6 @@ data: "list": [] }, "panels": [ - { - "id": 1, - "type": "gauge", - "title": "Workers Ready", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 4, - "x": 0, - "y": 0 - }, - "targets": [ - { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "min": 0, - "max": 18, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "orange", - "value": 16 - }, - { - "color": "yellow", - "value": 17 - }, - { - "color": "green", - "value": 18 - } - ] - } - }, - "overrides": [] - }, - "options": { - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "orientation": "auto", - "showThresholdMarkers": false, - "showThresholdLabels": false - } - }, { "id": 2, "type": "gauge", @@ -87,8 +26,8 @@ data: }, "gridPos": { "h": 5, - "w": 3, - "x": 4, + "w": 4, + "x": 0, "y": 0 }, "targets": [ @@ -141,7 +80,7 @@ data: "gridPos": { "h": 5, "w": 3, - "x": 7, + "x": 4, "y": 0 }, "targets": [ @@ -205,6 +144,81 @@ data: } ] }, + { + "id": 5, + "type": "stat", + "title": "Stuck Terminating", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 7, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] + }, { "id": 27, "type": "stat", @@ -221,7 +235,7 @@ data: }, "targets": [ { - "expr": "-log10(1 - clamp_max(avg(avg_over_time(up[30d])), 0.999999))", + "expr": "-log10(1 - clamp_max(avg_over_time((min(((sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"}) / 3)), ((sum(kube_deployment_status_replicas_available{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}) / clamp_min(sum(kube_deployment_spec_replicas{namespace=~\"traefik|kube-system\",deployment=\"traefik\"}), 1)))))[30d:5m]), 0.999999999))", "refId": "A" } ], @@ -425,9 +439,9 @@ data: ] }, { - "id": 5, - "type": "stat", - "title": "Stuck Terminating", + "id": 1, + "type": "gauge", + "title": "Workers Ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -440,48 +454,39 @@ data: }, "targets": [ { - "expr": "sum(max by (namespace,pod) (((time() - kube_pod_deletion_timestamp{pod!=\"\"}) > bool 600) and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)))", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-04|titan-05|titan-06|titan-07|titan-08|titan-09|titan-10|titan-11|titan-12|titan-13|titan-14|titan-15|titan-16|titan-17|titan-18|titan-19|titan-22|titan-24\"})", "refId": "A" } ], "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], + "min": 0, + "max": 18, "thresholds": { "mode": "absolute", "steps": [ { - "color": "green", + "color": "red", "value": null }, - { - "color": "yellow", - "value": 1 - }, { "color": "orange", - "value": 2 + "value": 16 }, { - "color": "red", - "value": 3 + "color": "yellow", + "value": 17 + }, + { + "color": "green", + "value": 18 } ] - }, - "unit": "none", - "custom": { - "displayMode": "auto" } }, "overrides": [] }, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", "reduceOptions": { "calcs": [ "lastNotNull" @@ -489,15 +494,10 @@ data: "fields": "", "values": false }, - "textMode": "value" - }, - "links": [ - { - "title": "Open atlas-pods dashboard", - "url": "/d/atlas-pods", - "targetBlank": true - } - ] + "orientation": "auto", + "showThresholdMarkers": false, + "showThresholdLabels": false + } }, { "id": 7,