cluster state: expand time-series summaries
This commit is contained in:
parent
69fd48d45b
commit
8571ef6f4d
@ -1547,6 +1547,64 @@ def _vm_scalar_window(expr: str, window: str, fn: str) -> float | None:
|
|||||||
return _vm_scalar(f"{fn}(({expr})[{window}])")
|
return _vm_scalar(f"{fn}(({expr})[{window}])")
|
||||||
|
|
||||||
|
|
||||||
|
def _scalar_trends(expr: str, windows: tuple[str, ...]) -> dict[str, dict[str, float | None]]:
    """Summarise ``expr`` over each lookback window as avg/min/max scalars.

    Args:
        expr: Query expression handed unchanged to ``_vm_scalar_window``.
        windows: Lookback windows to evaluate, used verbatim as result keys.

    Returns:
        A mapping ``window -> {"avg": ..., "min": ..., "max": ...}`` where each
        value is whatever ``_vm_scalar_window`` returned for that rollup.
    """
    # (result key, rollup function name) pairs, evaluated in this order.
    rollups = (
        ("avg", "avg_over_time"),
        ("min", "min_over_time"),
        ("max", "max_over_time"),
    )
    trends: dict[str, dict[str, float | None]] = {}
    for span in windows:
        trends[span] = {
            label: _vm_scalar_window(expr, span, rollup_fn)
            for label, rollup_fn in rollups
        }
    return trends
|
||||||
|
|
||||||
|
|
||||||
|
def _cluster_trends() -> dict[str, dict[str, dict[str, float | None]]]:
    """Build windowed summaries for a fixed set of cluster-wide queries.

    Each query is summarised with ``_scalar_trends`` over ``_TREND_WINDOWS``.
    Rate-based queries use ``_RATE_WINDOW`` as their range.

    Returns:
        A mapping ``metric key -> _scalar_trends(...) result`` in the order the
        queries are declared below.
    """
    node_queries = {
        "nodes_ready": 'sum(kube_node_status_condition{condition="Ready",status="true"})',
        "nodes_not_ready": 'sum(kube_node_status_condition{condition="Ready",status="false"})',
    }
    pod_queries = {
        "pods_running": 'sum(kube_pod_status_phase{phase="Running"})',
        "pods_pending": 'sum(kube_pod_status_phase{phase="Pending"})',
        "pods_failed": 'sum(kube_pod_status_phase{phase="Failed"})',
        "pods_succeeded": 'sum(kube_pod_status_phase{phase="Succeeded"})',
    }
    alert_queries = {
        "alerts_firing": 'sum(ALERTS{alertstate="firing"})',
    }
    usage_queries = {
        "cpu_usage": f'sum(rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}]))',
        "mem_usage": 'sum(container_memory_working_set_bytes{namespace!=""})',
        "net_io": (
            f'sum(rate(container_network_receive_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]) '
            f'+ rate(container_network_transmit_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]))'
        ),
        "fs_io": (
            f'sum(rate(container_fs_reads_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]) '
            f'+ rate(container_fs_writes_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]))'
        ),
    }

    # Merge groups in declaration order so result keys keep their original order.
    queries = {**node_queries, **pod_queries, **alert_queries, **usage_queries}

    summary: dict[str, dict[str, dict[str, float | None]]] = {}
    for name, query in queries.items():
        summary[name] = _scalar_trends(query, _TREND_WINDOWS)
    return summary
|
||||||
|
|
||||||
|
|
||||||
|
def _node_condition_trends() -> dict[str, dict[str, dict[str, float | None]]]:
    """Build windowed summaries of node-condition counts.

    Covers the Ready / not-Ready / unschedulable queries plus one query per
    entry of ``_PRESSURE_TYPES``, keyed by the lower-cased condition name.
    Each query is summarised with ``_scalar_trends`` over ``_TREND_WINDOWS``.

    Returns:
        A mapping ``condition key -> _scalar_trends(...) result``.
    """
    queries = {
        "ready": 'sum(kube_node_status_condition{condition="Ready",status="true"})',
        "not_ready": 'sum(kube_node_status_condition{condition="Ready",status="false"})',
        "unschedulable": "sum(kube_node_spec_unschedulable)",
    }
    # Add one "status=true" count per pressure condition after the fixed entries.
    queries.update(
        {
            pressure.lower(): f'sum(kube_node_status_condition{{condition="{pressure}",status="true"}})'
            for pressure in _PRESSURE_TYPES
        }
    )

    summary: dict[str, dict[str, dict[str, float | None]]] = {}
    for name, query in queries.items():
        summary[name] = _scalar_trends(query, _TREND_WINDOWS)
    return summary
|
||||||
|
|
||||||
|
|
||||||
|
def _pod_reason_totals(
    reasons: dict[str, str],
    series: str,
) -> dict[str, dict[str, dict[str, float | None]]]:
    """Summarise ``sum(series{reason=...})`` for every reason over the trend windows.

    Args:
        reasons: Maps each result key to the value matched against the
            ``reason`` label.
        series: Metric name that the ``reason`` selector is applied to.

    Returns:
        A mapping ``result key -> _scalar_trends(...) result`` evaluated over
        ``_TREND_WINDOWS``, in the iteration order of ``reasons``.
    """
    return {
        label: _scalar_trends(f'sum({series}{{reason="{value}"}})', _TREND_WINDOWS)
        for label, value in reasons.items()
    }
|
||||||
|
|
||||||
|
|
||||||
def _node_usage_exprs() -> dict[str, str]:
|
def _node_usage_exprs() -> dict[str, str]:
|
||||||
return {
|
return {
|
||||||
"cpu": (
|
"cpu": (
|
||||||
@ -2141,6 +2199,18 @@ def _collect_trend_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
|
|||||||
metrics["pod_waiting_trends"] = _pod_waiting_trends()
|
metrics["pod_waiting_trends"] = _pod_waiting_trends()
|
||||||
metrics["pod_terminated_now"] = _pod_terminated_now()
|
metrics["pod_terminated_now"] = _pod_terminated_now()
|
||||||
metrics["pod_terminated_trends"] = _pod_terminated_trends()
|
metrics["pod_terminated_trends"] = _pod_terminated_trends()
|
||||||
|
metrics["cluster_trends"] = _cluster_trends()
|
||||||
|
metrics["node_condition_trends"] = _node_condition_trends()
|
||||||
|
metrics["pod_reason_totals"] = {
|
||||||
|
"waiting": _pod_reason_totals(
|
||||||
|
_POD_WAITING_REASONS,
|
||||||
|
"kube_pod_container_status_waiting_reason",
|
||||||
|
),
|
||||||
|
"terminated": _pod_reason_totals(
|
||||||
|
_POD_TERMINATED_REASONS,
|
||||||
|
"kube_pod_container_status_terminated_reason",
|
||||||
|
),
|
||||||
|
}
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
errors.append(f"trends: {exc}")
|
errors.append(f"trends: {exc}")
|
||||||
|
|
||||||
@ -3298,6 +3368,9 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
|
|||||||
},
|
},
|
||||||
"pressure_summary": pressure_summary,
|
"pressure_summary": pressure_summary,
|
||||||
"trend_summary": metrics.get("trend_summary"),
|
"trend_summary": metrics.get("trend_summary"),
|
||||||
|
"time_series": metrics.get("cluster_trends"),
|
||||||
|
"node_condition_trends": metrics.get("node_condition_trends"),
|
||||||
|
"pod_reason_totals": metrics.get("pod_reason_totals"),
|
||||||
"offenders": _build_offenders(metrics),
|
"offenders": _build_offenders(metrics),
|
||||||
"alerts": metrics.get("alerts", {}),
|
"alerts": metrics.get("alerts", {}),
|
||||||
"top": {
|
"top": {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user