diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index f577eab..618cf30 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -81,7 +81,7 @@ CONTROL_SUFFIX = f"/{CONTROL_TOTAL}" WORKER_SUFFIX = f"/{WORKER_TOTAL}" CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system" LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]" -GAUGE_WIDTHS = [5, 5, 5, 5, 4] +GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4] CONTROL_WORKLOADS_EXPR = ( f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)' ) @@ -198,6 +198,18 @@ STUCK_TERMINATING_EXPR = ( ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)' '))' ) +UPTIME_WINDOW = "30d" +UPTIME_AVG_EXPR = f"avg(avg_over_time(up[{UPTIME_WINDOW}]))" +UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999))" +UPTIME_THRESHOLDS = { + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 2}, + {"color": "yellow", "value": 3}, + {"color": "green", "value": 3.5}, + ], +} PROBLEM_TABLE_EXPR = ( "(time() - kube_pod_created{pod!=\"\"}) " "* on(namespace,pod) group_left(node) kube_pod_info " @@ -555,61 +567,24 @@ def link_to(uid): def build_overview(): panels = [] + count_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 2}, + {"color": "red", "value": 3}, + ], + } + row1_stats = [ - ( - 1, - "Workers Ready", - f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', - WORKER_SUFFIX, - WORKER_TOTAL, - None, - ), - ( - 2, - "Control Plane Ready", - f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})', - CONTROL_SUFFIX, - CONTROL_TOTAL, - None, - ), - ( - 3, - "Control Plane Workloads", - CONTROL_WORKLOADS_EXPR, - None, - 4, - link_to("atlas-pods"), - ), - ( - 4, - "Problem Pods", - PROBLEM_PODS_EXPR, - None, - 1, - link_to("atlas-pods"), - ), - ( - 5, - "Stuck Terminating", - STUCK_TERMINATING_EXPR, - None, - 1, - link_to("atlas-pods"), - ), - ] - - def gauge_grid(idx): - width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4 - x = sum(GAUGE_WIDTHS[:idx]) - return width, x - - for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats): - thresholds = None - min_value = 0 - max_value = ok_value or 5 - if panel_id == 1: - max_value = WORKER_TOTAL - thresholds = { + { + "id": 1, + "title": "Workers Ready", + "expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', + "kind": "gauge", + "max_value": WORKER_TOTAL, + "thresholds": { "mode": "absolute", "steps": [ {"color": "red", "value": None}, @@ -617,60 +592,100 @@ def build_overview(): {"color": "yellow", "value": WORKER_TOTAL - 1}, {"color": "green", "value": WORKER_TOTAL}, ], - } - elif panel_id == 2: - max_value = CONTROL_TOTAL - thresholds = { + }, + }, + { + "id": 2, + "title": "Control Plane Ready", + "expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})', + "kind": "gauge", + "max_value": CONTROL_TOTAL, + "thresholds": { "mode": "absolute", "steps": [ {"color": "red", "value": None}, {"color": "green", "value": CONTROL_TOTAL}, ], - } - elif panel_id in (3, 4, 5): - max_value = 4 - thresholds = { - "mode": "absolute", - "steps": [ - {"color": "green", "value": None}, - {"color": "yellow", "value": 1}, - {"color": "orange", "value": 2}, - {"color": "red", "value": 3}, - ], - } - else: - thresholds = { - "mode": "absolute", - "steps": [ - {"color": "green", "value": None}, - {"color": "red", "value": max_value}, - ], - } + }, + }, + { + "id": 3, + "title": "Control Plane Workloads", + "expr": CONTROL_WORKLOADS_EXPR, + "kind": "stat", + "thresholds": count_thresholds, + "links": link_to("atlas-pods"), + }, + { + "id": 27, + "title": "Atlas Uptime (30d, 9s)", + "expr": UPTIME_NINES_EXPR, + "kind": "stat", + "thresholds": UPTIME_THRESHOLDS, + "value_suffix": " 9s", + "text_mode": "value", + }, + { + "id": 4, + "title": "Problem Pods", + "expr": PROBLEM_PODS_EXPR, + "kind": "stat", + "thresholds": count_thresholds, + "links": link_to("atlas-pods"), + }, + { + "id": 6, + "title": "CrashLoop / ImagePull", + "expr": CRASHLOOP_EXPR, + "kind": "stat", + "thresholds": count_thresholds, + "links": link_to("atlas-pods"), + }, + { + "id": 5, + "title": "Stuck Terminating", + "expr": STUCK_TERMINATING_EXPR, + "kind": "stat", + "thresholds": count_thresholds, + "links": link_to("atlas-pods"), + }, + ] + + def gauge_grid(idx): + width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4 + x = sum(GAUGE_WIDTHS[:idx]) + return width, x + + for idx, item in enumerate(row1_stats): + panel_id = item["id"] width, x = gauge_grid(idx) - if panel_id in (3, 4, 5): + grid = {"h": 5, "w": width, "x": x, "y": 0} + kind = item.get("kind", "gauge") + if kind == "stat": panels.append( stat_panel( panel_id, - title, - expr, - {"h": 5, "w": width, "x": x, "y": 0}, - thresholds=thresholds, + item["title"], + item["expr"], + grid, + thresholds=item.get("thresholds"), legend=None, - links=links, - text_mode="value", + links=item.get("links"), + text_mode=item.get("text_mode", "value"), + value_suffix=item.get("value_suffix"), ) ) else: panels.append( gauge_panel( panel_id, - title, - expr, - {"h": 5, "w": width, "x": x, "y": 0}, - min_value=min_value, - max_value=max_value, - thresholds=thresholds, - links=links, + item["title"], + item["expr"], + grid, + min_value=0, + max_value=item.get("max_value", 5), + thresholds=item.get("thresholds"), + links=item.get("links"), ) ) diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index beb676e..455e9d9 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -17,7 +17,7 @@ }, "gridPos": { "h": 5, - "w": 5, + "w": 4, "x": 0, "y": 0 }, @@ -78,8 +78,8 @@ }, "gridPos": { "h": 5, - "w": 5, - "x": 5, + "w": 3, + "x": 4, "y": 0 }, "targets": [ @@ -131,8 +131,8 @@ }, "gridPos": { "h": 5, - "w": 5, - "x": 10, + "w": 3, + "x": 7, "y": 0 }, "targets": [ @@ -196,6 +196,75 @@ } ] }, + { + "id": 27, + "type": "stat", + "title": "Atlas Uptime (30d, 9s)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 10, + "y": 0 + }, + "targets": [ + { + "expr": "-log10(1 - clamp_max(avg(avg_over_time(up[30d])), 0.999999))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "yellow", + "value": 3 + }, + { + "color": "green", + "value": 3.5 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": " 9s" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, { "id": 4, "type": "stat", @@ -206,8 +275,8 @@ }, "gridPos": { "h": 5, - "w": 5, - "x": 15, + "w": 3, + "x": 14, "y": 0 }, "targets": [ @@ -271,6 +340,81 @@ } ] }, + { + "id": 6, + "type": "stat", + "title": "CrashLoop / ImagePull", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 17, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] + }, { "id": 5, "type": "stat", diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index ef17ebf..7d6d57f 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -26,7 +26,7 @@ data: }, "gridPos": { "h": 5, - "w": 5, + "w": 4, "x": 0, "y": 0 }, @@ -87,8 +87,8 @@ data: }, "gridPos": { "h": 5, - "w": 5, - "x": 5, + "w": 3, + "x": 4, "y": 0 }, "targets": [ @@ -140,8 +140,8 @@ data: }, "gridPos": { "h": 5, - "w": 5, - "x": 10, + "w": 3, + "x": 7, "y": 0 }, "targets": [ @@ -205,6 +205,75 @@ data: } ] }, + { + "id": 27, + "type": "stat", + "title": "Atlas Uptime (30d, 9s)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 10, + "y": 0 + }, + "targets": [ + { + "expr": "-log10(1 - clamp_max(avg(avg_over_time(up[30d])), 0.999999))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "yellow", + "value": 3 + }, + { + "color": "green", + "value": 3.5 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto", + "valueSuffix": " 9s" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, { "id": 4, "type": "stat", @@ -215,8 +284,8 @@ data: }, "gridPos": { "h": 5, - "w": 5, - "x": 15, + "w": 3, + "x": 14, "y": 0 }, "targets": [ @@ -280,6 +349,81 @@ data: } ] }, + { + "id": 6, + "type": "stat", + "title": "CrashLoop / ImagePull", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 17, + "y": 0 + }, + "targets": [ + { + "expr": "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"}))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-pods dashboard", + "url": "/d/atlas-pods", + "targetBlank": true + } + ] + }, { "id": 5, "type": "stat",