cluster_state: add pressure and pod issues
This commit is contained in:
parent
30a9377594
commit
ef756ff1fa
@ -41,6 +41,23 @@ _SYSTEM_NAMESPACES = {
|
|||||||
_WORKLOAD_ALLOWED_NAMESPACES = {
|
_WORKLOAD_ALLOWED_NAMESPACES = {
|
||||||
"maintenance",
|
"maintenance",
|
||||||
}
|
}
|
||||||
|
_CAPACITY_KEYS = {
|
||||||
|
"cpu",
|
||||||
|
"memory",
|
||||||
|
"pods",
|
||||||
|
"ephemeral-storage",
|
||||||
|
}
|
||||||
|
_PRESSURE_TYPES = {
|
||||||
|
"MemoryPressure",
|
||||||
|
"DiskPressure",
|
||||||
|
"PIDPressure",
|
||||||
|
"NetworkUnavailable",
|
||||||
|
}
|
||||||
|
_PHASE_SEVERITY = {
|
||||||
|
"Failed": 3,
|
||||||
|
"Pending": 2,
|
||||||
|
"Unknown": 1,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
@ -135,6 +152,7 @@ def _node_details(payload: dict[str, Any]) -> list[dict[str, Any]]:
|
|||||||
if not name:
|
if not name:
|
||||||
continue
|
continue
|
||||||
roles = _node_roles(labels)
|
roles = _node_roles(labels)
|
||||||
|
conditions = _node_pressure_conditions(status.get("conditions"))
|
||||||
details.append(
|
details.append(
|
||||||
{
|
{
|
||||||
"name": name,
|
"name": name,
|
||||||
@ -149,6 +167,9 @@ def _node_details(payload: dict[str, Any]) -> list[dict[str, Any]]:
|
|||||||
"kubelet": node_info.get("kubeletVersion") or "",
|
"kubelet": node_info.get("kubeletVersion") or "",
|
||||||
"container_runtime": node_info.get("containerRuntimeVersion") or "",
|
"container_runtime": node_info.get("containerRuntimeVersion") or "",
|
||||||
"addresses": _node_addresses(status),
|
"addresses": _node_addresses(status),
|
||||||
|
"capacity": _node_capacity(status.get("capacity")),
|
||||||
|
"allocatable": _node_capacity(status.get("allocatable")),
|
||||||
|
"pressure": conditions,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
details.sort(key=lambda item: item.get("name") or "")
|
details.sort(key=lambda item: item.get("name") or "")
|
||||||
@ -164,18 +185,28 @@ def _summarize_inventory(details: list[dict[str, Any]]) -> dict[str, Any]:
|
|||||||
"by_arch": {},
|
"by_arch": {},
|
||||||
"by_role": {},
|
"by_role": {},
|
||||||
"not_ready_names": [],
|
"not_ready_names": [],
|
||||||
|
"pressure_nodes": {key: [] for key in _PRESSURE_TYPES},
|
||||||
}
|
}
|
||||||
not_ready: list[str] = []
|
not_ready: list[str] = []
|
||||||
for node in details:
|
for node in details:
|
||||||
|
name = _apply_node_summary(summary, node)
|
||||||
|
if name and not node.get("ready"):
|
||||||
|
not_ready.append(name)
|
||||||
|
not_ready.sort()
|
||||||
|
summary["not_ready_names"] = not_ready
|
||||||
|
for cond_type in summary["pressure_nodes"]:
|
||||||
|
summary["pressure_nodes"][cond_type].sort()
|
||||||
|
return summary
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_node_summary(summary: dict[str, Any], node: dict[str, Any]) -> str:
|
||||||
name = node.get("name") if isinstance(node, dict) else ""
|
name = node.get("name") if isinstance(node, dict) else ""
|
||||||
if not isinstance(name, str) or not name:
|
if not isinstance(name, str) or not name:
|
||||||
continue
|
return ""
|
||||||
summary["total"] += 1
|
summary["total"] += 1
|
||||||
ready = bool(node.get("ready"))
|
ready = bool(node.get("ready"))
|
||||||
if ready:
|
if ready:
|
||||||
summary["ready"] += 1
|
summary["ready"] += 1
|
||||||
else:
|
|
||||||
not_ready.append(name)
|
|
||||||
if node.get("is_worker"):
|
if node.get("is_worker"):
|
||||||
summary["workers"]["total"] += 1
|
summary["workers"]["total"] += 1
|
||||||
if ready:
|
if ready:
|
||||||
@ -186,9 +217,41 @@ def _summarize_inventory(details: list[dict[str, Any]]) -> dict[str, Any]:
|
|||||||
summary["by_arch"][arch] = summary["by_arch"].get(arch, 0) + 1
|
summary["by_arch"][arch] = summary["by_arch"].get(arch, 0) + 1
|
||||||
for role in node.get("roles") or []:
|
for role in node.get("roles") or []:
|
||||||
summary["by_role"][role] = summary["by_role"].get(role, 0) + 1
|
summary["by_role"][role] = summary["by_role"].get(role, 0) + 1
|
||||||
not_ready.sort()
|
_apply_pressure(summary, node, name)
|
||||||
summary["not_ready_names"] = not_ready
|
return name
|
||||||
return summary
|
|
||||||
|
|
||||||
|
def _apply_pressure(summary: dict[str, Any], node: dict[str, Any], name: str) -> None:
|
||||||
|
pressure = node.get("pressure") or {}
|
||||||
|
if not isinstance(pressure, dict):
|
||||||
|
return
|
||||||
|
for cond_type, active in pressure.items():
|
||||||
|
if active and cond_type in summary["pressure_nodes"]:
|
||||||
|
summary["pressure_nodes"][cond_type].append(name)
|
||||||
|
|
||||||
|
|
||||||
|
def _node_capacity(raw: Any) -> dict[str, str]:
|
||||||
|
if not isinstance(raw, dict):
|
||||||
|
return {}
|
||||||
|
output: dict[str, str] = {}
|
||||||
|
for key in _CAPACITY_KEYS:
|
||||||
|
value = raw.get(key)
|
||||||
|
if isinstance(value, (str, int, float)) and value != "":
|
||||||
|
output[key] = str(value)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def _node_pressure_conditions(conditions: Any) -> dict[str, bool]:
|
||||||
|
if not isinstance(conditions, list):
|
||||||
|
return {}
|
||||||
|
pressure: dict[str, bool] = {}
|
||||||
|
for condition in conditions:
|
||||||
|
if not isinstance(condition, dict):
|
||||||
|
continue
|
||||||
|
cond_type = condition.get("type")
|
||||||
|
if cond_type in _PRESSURE_TYPES:
|
||||||
|
pressure[cond_type] = condition.get("status") == "True"
|
||||||
|
return pressure
|
||||||
|
|
||||||
|
|
||||||
def _node_roles(labels: dict[str, Any]) -> list[str]:
|
def _node_roles(labels: dict[str, Any]) -> list[str]:
|
||||||
@ -508,6 +571,54 @@ def _node_pod_finalize(nodes: dict[str, dict[str, Any]]) -> list[dict[str, Any]]
|
|||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
|
||||||
|
items: list[dict[str, Any]] = []
|
||||||
|
counts: dict[str, int] = {key: 0 for key in _PHASE_SEVERITY}
|
||||||
|
for pod in _items(payload):
|
||||||
|
metadata = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
|
||||||
|
status = pod.get("status") if isinstance(pod.get("status"), dict) else {}
|
||||||
|
spec = pod.get("spec") if isinstance(pod.get("spec"), dict) else {}
|
||||||
|
namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
|
||||||
|
name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
|
||||||
|
if not name or not namespace:
|
||||||
|
continue
|
||||||
|
phase = status.get("phase") if isinstance(status.get("phase"), str) else ""
|
||||||
|
restarts = 0
|
||||||
|
waiting_reasons: list[str] = []
|
||||||
|
for container in status.get("containerStatuses") or []:
|
||||||
|
if not isinstance(container, dict):
|
||||||
|
continue
|
||||||
|
restarts += int(container.get("restartCount") or 0)
|
||||||
|
state = container.get("state") if isinstance(container.get("state"), dict) else {}
|
||||||
|
waiting = state.get("waiting") if isinstance(state.get("waiting"), dict) else {}
|
||||||
|
reason = waiting.get("reason")
|
||||||
|
if isinstance(reason, str) and reason:
|
||||||
|
waiting_reasons.append(reason)
|
||||||
|
if phase in counts:
|
||||||
|
counts[phase] += 1
|
||||||
|
if phase in _PHASE_SEVERITY or restarts > 0:
|
||||||
|
items.append(
|
||||||
|
{
|
||||||
|
"namespace": namespace,
|
||||||
|
"pod": name,
|
||||||
|
"node": spec.get("nodeName") or "",
|
||||||
|
"phase": phase,
|
||||||
|
"reason": status.get("reason") or "",
|
||||||
|
"restarts": restarts,
|
||||||
|
"waiting_reasons": sorted(set(waiting_reasons)),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
items.sort(
|
||||||
|
key=lambda item: (
|
||||||
|
-_PHASE_SEVERITY.get(item.get("phase") or "", 0),
|
||||||
|
-(item.get("restarts") or 0),
|
||||||
|
item.get("namespace") or "",
|
||||||
|
item.get("pod") or "",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return {"counts": counts, "items": items[:20]}
|
||||||
|
|
||||||
|
|
||||||
def _vm_query(expr: str) -> list[dict[str, Any]] | None:
|
def _vm_query(expr: str) -> list[dict[str, Any]] | None:
|
||||||
base = settings.vm_url
|
base = settings.vm_url
|
||||||
if not base:
|
if not base:
|
||||||
@ -689,6 +800,12 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
|||||||
metrics["nodes_ready"] = _vm_scalar(
|
metrics["nodes_ready"] = _vm_scalar(
|
||||||
"count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})"
|
"count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})"
|
||||||
)
|
)
|
||||||
|
metrics["capacity_cpu"] = _vm_scalar("sum(kube_node_status_capacity_cpu_cores)")
|
||||||
|
metrics["allocatable_cpu"] = _vm_scalar("sum(kube_node_status_allocatable_cpu_cores)")
|
||||||
|
metrics["capacity_mem_bytes"] = _vm_scalar("sum(kube_node_status_capacity_memory_bytes)")
|
||||||
|
metrics["allocatable_mem_bytes"] = _vm_scalar("sum(kube_node_status_allocatable_memory_bytes)")
|
||||||
|
metrics["capacity_pods"] = _vm_scalar("sum(kube_node_status_capacity_pods)")
|
||||||
|
metrics["allocatable_pods"] = _vm_scalar("sum(kube_node_status_allocatable_pods)")
|
||||||
metrics["pods_running"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Running\"})")
|
metrics["pods_running"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Running\"})")
|
||||||
metrics["pods_pending"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Pending\"})")
|
metrics["pods_pending"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Pending\"})")
|
||||||
metrics["pods_failed"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Failed\"})")
|
metrics["pods_failed"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Failed\"})")
|
||||||
@ -728,6 +845,12 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
|||||||
"restarts": "count",
|
"restarts": "count",
|
||||||
"namespace_cpu": "cores",
|
"namespace_cpu": "cores",
|
||||||
"namespace_mem": "bytes",
|
"namespace_mem": "bytes",
|
||||||
|
"capacity_cpu": "cores",
|
||||||
|
"allocatable_cpu": "cores",
|
||||||
|
"capacity_mem_bytes": "bytes",
|
||||||
|
"allocatable_mem_bytes": "bytes",
|
||||||
|
"capacity_pods": "count",
|
||||||
|
"allocatable_pods": "count",
|
||||||
}
|
}
|
||||||
metrics["windows"] = {
|
metrics["windows"] = {
|
||||||
"rates": _RATE_WINDOW,
|
"rates": _RATE_WINDOW,
|
||||||
@ -764,12 +887,14 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
|
|||||||
namespace_pods: list[dict[str, Any]] = []
|
namespace_pods: list[dict[str, Any]] = []
|
||||||
namespace_nodes: list[dict[str, Any]] = []
|
namespace_nodes: list[dict[str, Any]] = []
|
||||||
node_pods: list[dict[str, Any]] = []
|
node_pods: list[dict[str, Any]] = []
|
||||||
|
pod_issues: dict[str, Any] = {}
|
||||||
try:
|
try:
|
||||||
pods_payload = get_json("/api/v1/pods?limit=5000")
|
pods_payload = get_json("/api/v1/pods?limit=5000")
|
||||||
workloads = _summarize_workloads(pods_payload)
|
workloads = _summarize_workloads(pods_payload)
|
||||||
namespace_pods = _summarize_namespace_pods(pods_payload)
|
namespace_pods = _summarize_namespace_pods(pods_payload)
|
||||||
namespace_nodes = _summarize_namespace_nodes(pods_payload)
|
namespace_nodes = _summarize_namespace_nodes(pods_payload)
|
||||||
node_pods = _summarize_node_pods(pods_payload)
|
node_pods = _summarize_node_pods(pods_payload)
|
||||||
|
pod_issues = _summarize_pod_issues(pods_payload)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
errors.append(f"pods: {exc}")
|
errors.append(f"pods: {exc}")
|
||||||
|
|
||||||
@ -785,6 +910,7 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
|
|||||||
"namespace_pods": namespace_pods,
|
"namespace_pods": namespace_pods,
|
||||||
"namespace_nodes": namespace_nodes,
|
"namespace_nodes": namespace_nodes,
|
||||||
"node_pods": node_pods,
|
"node_pods": node_pods,
|
||||||
|
"pod_issues": pod_issues,
|
||||||
"metrics": metrics,
|
"metrics": metrics,
|
||||||
"errors": errors,
|
"errors": errors,
|
||||||
}
|
}
|
||||||
|
|||||||
@ -78,12 +78,14 @@ def test_collect_cluster_state(monkeypatch) -> None:
|
|||||||
assert snapshot["flux"]["not_ready"] == 1
|
assert snapshot["flux"]["not_ready"] == 1
|
||||||
assert snapshot["nodes_summary"]["total"] == 2
|
assert snapshot["nodes_summary"]["total"] == 2
|
||||||
assert snapshot["nodes_summary"]["ready"] == 1
|
assert snapshot["nodes_summary"]["ready"] == 1
|
||||||
|
assert "pressure_nodes" in snapshot["nodes_summary"]
|
||||||
assert snapshot["nodes_detail"]
|
assert snapshot["nodes_detail"]
|
||||||
assert snapshot["workloads"]
|
assert snapshot["workloads"]
|
||||||
assert snapshot["namespace_pods"]
|
assert snapshot["namespace_pods"]
|
||||||
assert snapshot["namespace_pods"][0]["namespace"] == "media"
|
assert snapshot["namespace_pods"][0]["namespace"] == "media"
|
||||||
assert snapshot["namespace_nodes"]
|
assert snapshot["namespace_nodes"]
|
||||||
assert snapshot["node_pods"]
|
assert snapshot["node_pods"]
|
||||||
|
assert "pod_issues" in snapshot
|
||||||
assert "node_usage_stats" in snapshot["metrics"]
|
assert "node_usage_stats" in snapshot["metrics"]
|
||||||
assert snapshot["metrics"]["namespace_cpu_top"] == []
|
assert snapshot["metrics"]["namespace_cpu_top"] == []
|
||||||
assert snapshot["metrics"]["namespace_mem_top"] == []
|
assert snapshot["metrics"]["namespace_mem_top"] == []
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user