cluster state: expand time-series summaries

This commit is contained in:
Brad Stein 2026-01-31 13:37:25 -03:00
parent 69fd48d45b
commit 8571ef6f4d

View File

@ -1547,6 +1547,64 @@ def _vm_scalar_window(expr: str, window: str, fn: str) -> float | None:
return _vm_scalar(f"{fn}(({expr})[{window}])") return _vm_scalar(f"{fn}(({expr})[{window}])")
def _scalar_trends(expr: str, windows: tuple[str, ...]) -> dict[str, dict[str, float | None]]:
    """Summarise ``expr`` over each lookback window.

    For every entry in ``windows`` the expression is evaluated through
    ``_vm_scalar_window`` with ``avg_over_time``, ``min_over_time`` and
    ``max_over_time`` (in that order); the three results are stored under
    the keys ``"avg"``, ``"min"`` and ``"max"``.

    Returns a mapping of window -> {"avg", "min", "max"} -> value, where each
    value is whatever ``_vm_scalar_window`` returned for that query.
    """
    # (output key, rollup function name) pairs, evaluated in this order.
    rollups = (
        ("avg", "avg_over_time"),
        ("min", "min_over_time"),
        ("max", "max_over_time"),
    )
    trends: dict[str, dict[str, float | None]] = {}
    for window in windows:
        trends[window] = {
            label: _vm_scalar_window(expr, window, fn) for label, fn in rollups
        }
    return trends
def _cluster_trends() -> dict[str, dict[str, dict[str, float | None]]]:
    """Collect avg/min/max trend summaries for cluster-wide signals.

    Builds one query expression per signal (node readiness, pod phases,
    firing alerts, container CPU / memory / network / filesystem usage) and
    passes each through ``_scalar_trends`` with ``_TREND_WINDOWS``.

    Returns a mapping of signal name -> window -> {"avg", "min", "max"}.
    """

    def _node_ready(status: str) -> str:
        # Count of nodes whose "Ready" condition has the given status.
        return f'sum(kube_node_status_condition{{condition="Ready",status="{status}"}})'

    def _pods_in(phase: str) -> str:
        # Count of pods reported in the given phase.
        return f'sum(kube_pod_status_phase{{phase="{phase}"}})'

    def _rate(metric: str) -> str:
        # Per-series rate of a container counter, restricted to series
        # carrying a non-empty namespace label.
        return f'rate({metric}{{namespace!=""}}[{_RATE_WINDOW}])'

    def _rate_pair_sum(first: str, second: str) -> str:
        # Sum over the element-wise addition of two counter rates.
        return f"sum({_rate(first)} + {_rate(second)})"

    # Insertion order is kept identical to the query order below.
    signal_exprs: dict[str, str] = {
        "nodes_ready": _node_ready("true"),
        "nodes_not_ready": _node_ready("false"),
        "pods_running": _pods_in("Running"),
        "pods_pending": _pods_in("Pending"),
        "pods_failed": _pods_in("Failed"),
        "pods_succeeded": _pods_in("Succeeded"),
        "alerts_firing": 'sum(ALERTS{alertstate="firing"})',
        "cpu_usage": f"sum({_rate('container_cpu_usage_seconds_total')})",
        "mem_usage": 'sum(container_memory_working_set_bytes{namespace!=""})',
        "net_io": _rate_pair_sum(
            "container_network_receive_bytes_total",
            "container_network_transmit_bytes_total",
        ),
        "fs_io": _rate_pair_sum(
            "container_fs_reads_bytes_total",
            "container_fs_writes_bytes_total",
        ),
    }

    trends: dict[str, dict[str, dict[str, float | None]]] = {}
    for name, query in signal_exprs.items():
        trends[name] = _scalar_trends(query, _TREND_WINDOWS)
    return trends
def _node_condition_trends() -> dict[str, dict[str, dict[str, float | None]]]:
    """Collect avg/min/max trend summaries for node condition counts.

    Covers the ``Ready`` condition (true and false), the unschedulable node
    count, and one entry per name in ``_PRESSURE_TYPES`` (keyed by the
    lower-cased name, counting nodes where that condition is ``"true"``).
    Each expression is summarised via ``_scalar_trends`` over
    ``_TREND_WINDOWS``.

    Returns a mapping of condition key -> window -> {"avg", "min", "max"}.
    """

    def _condition_count(condition: str, status: str) -> str:
        # Number of nodes reporting `condition` with the given status.
        return (
            f'sum(kube_node_status_condition{{condition="{condition}",status="{status}"}})'
        )

    condition_exprs: dict[str, str] = {
        "ready": _condition_count("Ready", "true"),
        "not_ready": _condition_count("Ready", "false"),
        "unschedulable": "sum(kube_node_spec_unschedulable)",
    }
    # Pressure conditions are appended after the fixed entries; a colliding
    # key overrides the earlier expression, as with plain item assignment.
    condition_exprs.update(
        {
            pressure.lower(): _condition_count(pressure, "true")
            for pressure in _PRESSURE_TYPES
        }
    )

    trends: dict[str, dict[str, dict[str, float | None]]] = {}
    for name, query in condition_exprs.items():
        trends[name] = _scalar_trends(query, _TREND_WINDOWS)
    return trends
def _pod_reason_totals(
    reasons: dict[str, str],
    series: str,
) -> dict[str, dict[str, dict[str, float | None]]]:
    """Summarise per-reason totals of ``series`` over the trend windows.

    Args:
        reasons: Mapping of output key -> value matched against the
            ``reason`` label of ``series``.
        series: Metric name whose ``reason``-filtered samples are summed.

    Returns:
        Mapping of output key -> window -> {"avg", "min", "max"}, as produced
        by ``_scalar_trends`` with ``_TREND_WINDOWS``.
    """
    return {
        key: _scalar_trends(f'sum({series}{{reason="{reason}"}})', _TREND_WINDOWS)
        for key, reason in reasons.items()
    }
def _node_usage_exprs() -> dict[str, str]: def _node_usage_exprs() -> dict[str, str]:
return { return {
"cpu": ( "cpu": (
@ -2141,6 +2199,18 @@ def _collect_trend_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
metrics["pod_waiting_trends"] = _pod_waiting_trends() metrics["pod_waiting_trends"] = _pod_waiting_trends()
metrics["pod_terminated_now"] = _pod_terminated_now() metrics["pod_terminated_now"] = _pod_terminated_now()
metrics["pod_terminated_trends"] = _pod_terminated_trends() metrics["pod_terminated_trends"] = _pod_terminated_trends()
metrics["cluster_trends"] = _cluster_trends()
metrics["node_condition_trends"] = _node_condition_trends()
metrics["pod_reason_totals"] = {
"waiting": _pod_reason_totals(
_POD_WAITING_REASONS,
"kube_pod_container_status_waiting_reason",
),
"terminated": _pod_reason_totals(
_POD_TERMINATED_REASONS,
"kube_pod_container_status_terminated_reason",
),
}
except Exception as exc: except Exception as exc:
errors.append(f"trends: {exc}") errors.append(f"trends: {exc}")
@ -3298,6 +3368,9 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
}, },
"pressure_summary": pressure_summary, "pressure_summary": pressure_summary,
"trend_summary": metrics.get("trend_summary"), "trend_summary": metrics.get("trend_summary"),
"time_series": metrics.get("cluster_trends"),
"node_condition_trends": metrics.get("node_condition_trends"),
"pod_reason_totals": metrics.get("pod_reason_totals"),
"offenders": _build_offenders(metrics), "offenders": _build_offenders(metrics),
"alerts": metrics.get("alerts", {}), "alerts": metrics.get("alerts", {}),
"top": { "top": {