From 8571ef6f4d2c3f97989e2b22cddfc2686486563d Mon Sep 17 00:00:00 2001
From: Brad Stein
Date: Sat, 31 Jan 2026 13:37:25 -0300
Subject: [PATCH] cluster state: expand time-series summaries

---
Notes (reviewer commentary; this section is not part of the commit message):

  What the patch adds
    * _scalar_trends(expr, windows): for every window in `windows`, builds
      {"avg", "min", "max"} by calling _vm_scalar_window(expr, window, fn)
      with avg_over_time / min_over_time / max_over_time. Each value is
      float | None, as returned by _vm_scalar_window.
    * _cluster_trends(): eleven cluster-wide expressions (ready / not-ready
      nodes, pod phases, firing alerts, CPU, memory, network I/O, filesystem
      I/O), each summarised with _scalar_trends over _TREND_WINDOWS. The rate
      expressions use _RATE_WINDOW as the range.
    * _node_condition_trends(): ready / not_ready / unschedulable plus one
      entry per item in _PRESSURE_TYPES, keyed by the lower-cased condition
      name, each summarised over _TREND_WINDOWS.
    * _pod_reason_totals(reasons, series): for each (key, reason) pair builds
      sum(<series>{reason="<reason>"}) and summarises it over _TREND_WINDOWS.
    * _collect_trend_metrics stores the results under metrics["cluster_trends"],
      metrics["node_condition_trends"] and metrics["pod_reason_totals"] (with
      "waiting" and "terminated" sub-keys). The new assignments sit directly
      above the existing `except Exception` handler, so a failure in any of
      them is reported through the same "trends: ..." error entry.
    * collect_cluster_state exposes them via metrics.get(...) as "time_series"
      (read from metrics["cluster_trends"]), "node_condition_trends" and
      "pod_reason_totals".

  Review observations (no code changed by these notes)
    * "nodes_ready" / "nodes_not_ready" in _cluster_trends use the same
      expressions as "ready" / "not_ready" in _node_condition_trends, so those
      two series are evaluated twice per collection.
    * Every expression costs three _vm_scalar_window calls per window. The
      patch adds 11 + (3 + len(_PRESSURE_TYPES)) + len(_POD_WAITING_REASONS)
      + len(_POD_TERMINATED_REASONS) expressions.
    * _pod_reason_totals returns avg/min/max per window rather than totals,
      despite its name.
    * The summary key is "time_series" while the metrics key it reads is
      "cluster_trends".
    * `reason` is interpolated into the label matcher without escaping; this
      presumably is safe because the reasons come from module constants
      (TODO confirm _POD_WAITING_REASONS / _POD_TERMINATED_REASONS hold no
      quotes or backslashes).
    * NOTE(review): the leading whitespace of the context lines below was
      reconstructed after line breaks were lost; the depth used in the third
      hunk is an assumption. Confirm against the target file, or apply with
      whitespace-insensitive context matching.

 ariadne/services/cluster_state.py | 73 +++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/ariadne/services/cluster_state.py b/ariadne/services/cluster_state.py
index eac6c60..05967c0 100644
--- a/ariadne/services/cluster_state.py
+++ b/ariadne/services/cluster_state.py
@@ -1547,6 +1547,64 @@ def _vm_scalar_window(expr: str, window: str, fn: str) -> float | None:
     return _vm_scalar(f"{fn}(({expr})[{window}])")
 
 
+def _scalar_trends(expr: str, windows: tuple[str, ...]) -> dict[str, dict[str, float | None]]:
+    return {
+        window: {
+            "avg": _vm_scalar_window(expr, window, "avg_over_time"),
+            "min": _vm_scalar_window(expr, window, "min_over_time"),
+            "max": _vm_scalar_window(expr, window, "max_over_time"),
+        }
+        for window in windows
+    }
+
+
+def _cluster_trends() -> dict[str, dict[str, dict[str, float | None]]]:
+    exprs = {
+        "nodes_ready": 'sum(kube_node_status_condition{condition="Ready",status="true"})',
+        "nodes_not_ready": 'sum(kube_node_status_condition{condition="Ready",status="false"})',
+        "pods_running": 'sum(kube_pod_status_phase{phase="Running"})',
+        "pods_pending": 'sum(kube_pod_status_phase{phase="Pending"})',
+        "pods_failed": 'sum(kube_pod_status_phase{phase="Failed"})',
+        "pods_succeeded": 'sum(kube_pod_status_phase{phase="Succeeded"})',
+        "alerts_firing": 'sum(ALERTS{alertstate="firing"})',
+        "cpu_usage": f'sum(rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}]))',
+        "mem_usage": 'sum(container_memory_working_set_bytes{namespace!=""})',
+        "net_io": (
+            f'sum(rate(container_network_receive_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]) '
+            f'+ rate(container_network_transmit_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]))'
+        ),
+        "fs_io": (
+            f'sum(rate(container_fs_reads_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]) '
+            f'+ rate(container_fs_writes_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]))'
+        ),
+    }
+    return {key: _scalar_trends(expr, _TREND_WINDOWS) for key, expr in exprs.items()}
+
+
+def _node_condition_trends() -> dict[str, dict[str, dict[str, float | None]]]:
+    conditions = {
+        "ready": 'sum(kube_node_status_condition{condition="Ready",status="true"})',
+        "not_ready": 'sum(kube_node_status_condition{condition="Ready",status="false"})',
+        "unschedulable": "sum(kube_node_spec_unschedulable)",
+    }
+    for cond in _PRESSURE_TYPES:
+        conditions[cond.lower()] = (
+            f'sum(kube_node_status_condition{{condition="{cond}",status="true"}})'
+        )
+    return {key: _scalar_trends(expr, _TREND_WINDOWS) for key, expr in conditions.items()}
+
+
+def _pod_reason_totals(
+    reasons: dict[str, str],
+    series: str,
+) -> dict[str, dict[str, dict[str, float | None]]]:
+    totals: dict[str, dict[str, dict[str, float | None]]] = {}
+    for key, reason in reasons.items():
+        expr = f'sum({series}{{reason="{reason}"}})'
+        totals[key] = _scalar_trends(expr, _TREND_WINDOWS)
+    return totals
+
+
 def _node_usage_exprs() -> dict[str, str]:
     return {
         "cpu": (
@@ -2141,6 +2199,18 @@ def _collect_trend_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
         metrics["pod_waiting_trends"] = _pod_waiting_trends()
         metrics["pod_terminated_now"] = _pod_terminated_now()
         metrics["pod_terminated_trends"] = _pod_terminated_trends()
+        metrics["cluster_trends"] = _cluster_trends()
+        metrics["node_condition_trends"] = _node_condition_trends()
+        metrics["pod_reason_totals"] = {
+            "waiting": _pod_reason_totals(
+                _POD_WAITING_REASONS,
+                "kube_pod_container_status_waiting_reason",
+            ),
+            "terminated": _pod_reason_totals(
+                _POD_TERMINATED_REASONS,
+                "kube_pod_container_status_terminated_reason",
+            ),
+        }
     except Exception as exc:
         errors.append(f"trends: {exc}")
 
@@ -3298,6 +3368,9 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
         },
         "pressure_summary": pressure_summary,
         "trend_summary": metrics.get("trend_summary"),
+        "time_series": metrics.get("cluster_trends"),
+        "node_condition_trends": metrics.get("node_condition_trends"),
+        "pod_reason_totals": metrics.get("pod_reason_totals"),
         "offenders": _build_offenders(metrics),
         "alerts": metrics.get("alerts", {}),
         "top": {