cluster state: expand time-series summaries
This commit is contained in:
parent
69fd48d45b
commit
8571ef6f4d
@ -1547,6 +1547,64 @@ def _vm_scalar_window(expr: str, window: str, fn: str) -> float | None:
|
||||
return _vm_scalar(f"{fn}(({expr})[{window}])")
|
||||
|
||||
|
||||
def _scalar_trends(expr: str, windows: tuple[str, ...]) -> dict[str, dict[str, float | None]]:
    """Summarise ``expr`` over each requested look-back window.

    Args:
        expr: Query expression handed unchanged to ``_vm_scalar_window``.
        windows: Window strings to evaluate; each becomes a key of the result.

    Returns:
        A mapping ``window -> {"avg": ..., "min": ..., "max": ...}`` where each
        value is whatever ``_vm_scalar_window`` returns for the matching
        ``*_over_time`` function (a float, or ``None``).
    """
    # Result key paired with the rollup function name passed to the helper.
    # Order matters only in that queries are issued avg, min, max per window.
    rollups = (
        ("avg", "avg_over_time"),
        ("min", "min_over_time"),
        ("max", "max_over_time"),
    )
    summary: dict[str, dict[str, float | None]] = {}
    for span in windows:
        summary[span] = {
            label: _vm_scalar_window(expr, span, fn_name)
            for label, fn_name in rollups
        }
    return summary
|
||||
|
||||
|
||||
def _cluster_trends() -> dict[str, dict[str, dict[str, float | None]]]:
    """Collect avg/min/max trends for a fixed set of cluster-wide queries.

    Returns:
        A mapping ``metric key -> _scalar_trends(query, _TREND_WINDOWS)`` for
        node readiness counts, pod phase counts, firing alerts, and summed
        container CPU, memory, network I/O and filesystem I/O.
    """
    # Label matcher shared by every container_* query below.
    ns_matcher = '{namespace!=""}'

    def _summed_rates(*counters: str) -> str:
        # sum(rate(a{...}[w]) + rate(b{...}[w]) ...) over the given counters.
        terms = " + ".join(
            f"rate({counter}{ns_matcher}[{_RATE_WINDOW}])" for counter in counters
        )
        return f"sum({terms})"

    queries: dict[str, str] = {}

    # Insertion order below fixes the key order of the returned mapping.
    for key, status in (("nodes_ready", "true"), ("nodes_not_ready", "false")):
        queries[key] = (
            f'sum(kube_node_status_condition{{condition="Ready",status="{status}"}})'
        )

    for phase in ("Running", "Pending", "Failed", "Succeeded"):
        queries[f"pods_{phase.lower()}"] = f'sum(kube_pod_status_phase{{phase="{phase}"}})'

    queries["alerts_firing"] = 'sum(ALERTS{alertstate="firing"})'
    queries["cpu_usage"] = _summed_rates("container_cpu_usage_seconds_total")
    queries["mem_usage"] = f"sum(container_memory_working_set_bytes{ns_matcher})"
    queries["net_io"] = _summed_rates(
        "container_network_receive_bytes_total",
        "container_network_transmit_bytes_total",
    )
    queries["fs_io"] = _summed_rates(
        "container_fs_reads_bytes_total",
        "container_fs_writes_bytes_total",
    )

    trends: dict[str, dict[str, dict[str, float | None]]] = {}
    for key, query in queries.items():
        trends[key] = _scalar_trends(query, _TREND_WINDOWS)
    return trends
|
||||
|
||||
|
||||
def _node_condition_trends() -> dict[str, dict[str, dict[str, float | None]]]:
    """Collect avg/min/max trends for node condition counts.

    Covers Ready (true and false), the unschedulable count, and one entry per
    name in ``_PRESSURE_TYPES`` keyed by the lower-cased condition name.

    Returns:
        A mapping ``condition key -> _scalar_trends(query, _TREND_WINDOWS)``.
    """

    def _condition_count(condition: str, status: str = "true") -> str:
        # Query summing the series for one condition/status pair.
        return (
            f'sum(kube_node_status_condition{{condition="{condition}",status="{status}"}})'
        )

    queries: dict[str, str] = {
        "ready": _condition_count("Ready"),
        "not_ready": _condition_count("Ready", "false"),
        "unschedulable": "sum(kube_node_spec_unschedulable)",
    }
    # Pressure conditions are appended after the fixed entries, in the order
    # _PRESSURE_TYPES yields them.
    queries.update(
        {pressure.lower(): _condition_count(pressure) for pressure in _PRESSURE_TYPES}
    )

    trends: dict[str, dict[str, dict[str, float | None]]] = {}
    for key, query in queries.items():
        trends[key] = _scalar_trends(query, _TREND_WINDOWS)
    return trends
|
||||
|
||||
|
||||
def _pod_reason_totals(
    reasons: dict[str, str],
    series: str,
) -> dict[str, dict[str, dict[str, float | None]]]:
    """Collect avg/min/max trends of ``series`` summed per reason.

    Args:
        reasons: Mapping of output key to the ``reason`` label value to match.
        series: Metric name that carries the ``reason`` label.

    Returns:
        A mapping ``output key -> _scalar_trends(query, _TREND_WINDOWS)`` where
        the query is ``sum(<series>{reason="<reason>"})``.
    """
    return {
        label: _scalar_trends(f'sum({series}{{reason="{reason_value}"}})', _TREND_WINDOWS)
        for label, reason_value in reasons.items()
    }
|
||||
|
||||
|
||||
def _node_usage_exprs() -> dict[str, str]:
|
||||
return {
|
||||
"cpu": (
|
||||
@ -2141,6 +2199,18 @@ def _collect_trend_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
|
||||
metrics["pod_waiting_trends"] = _pod_waiting_trends()
|
||||
metrics["pod_terminated_now"] = _pod_terminated_now()
|
||||
metrics["pod_terminated_trends"] = _pod_terminated_trends()
|
||||
metrics["cluster_trends"] = _cluster_trends()
|
||||
metrics["node_condition_trends"] = _node_condition_trends()
|
||||
metrics["pod_reason_totals"] = {
|
||||
"waiting": _pod_reason_totals(
|
||||
_POD_WAITING_REASONS,
|
||||
"kube_pod_container_status_waiting_reason",
|
||||
),
|
||||
"terminated": _pod_reason_totals(
|
||||
_POD_TERMINATED_REASONS,
|
||||
"kube_pod_container_status_terminated_reason",
|
||||
),
|
||||
}
|
||||
except Exception as exc:
|
||||
errors.append(f"trends: {exc}")
|
||||
|
||||
@ -3298,6 +3368,9 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
|
||||
},
|
||||
"pressure_summary": pressure_summary,
|
||||
"trend_summary": metrics.get("trend_summary"),
|
||||
"time_series": metrics.get("cluster_trends"),
|
||||
"node_condition_trends": metrics.get("node_condition_trends"),
|
||||
"pod_reason_totals": metrics.get("pod_reason_totals"),
|
||||
"offenders": _build_offenders(metrics),
|
||||
"alerts": metrics.get("alerts", {}),
|
||||
"top": {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user