Compare commits
10 Commits
558f5c1270
...
cacaaaad53
| Author | SHA1 | Date | |
|---|---|---|---|
| cacaaaad53 | |||
| ad75df7444 | |||
| d1b1687a92 | |||
| 36a16f00b7 | |||
| e73c1a4e1c | |||
| 281118b810 | |||
| 2370aa4e5d | |||
| e809f0b8bd | |||
| ef756ff1fa | |||
| 30a9377594 |
@ -41,6 +41,25 @@ _SYSTEM_NAMESPACES = {
|
||||
# NOTE(review): not referenced in the visible code; presumably namespaces that
# are exempt from the system-namespace filter for workloads — confirm.
_WORKLOAD_ALLOWED_NAMESPACES = {
    "maintenance",
}
# Resource names that _node_capacity copies out of a node's capacity and
# allocatable maps; any other key is dropped.
_CAPACITY_KEYS = {
    "cpu",
    "memory",
    "pods",
    "ephemeral-storage",
}
# Node condition types recorded by _node_pressure_conditions and used as the
# keys of the "pressure_nodes" mapping in the inventory summary.
_PRESSURE_TYPES = {
    "MemoryPressure",
    "DiskPressure",
    "PIDPressure",
    "NetworkUnavailable",
}
# Maximum number of entries kept in "warnings_recent" by _summarize_events.
_EVENTS_MAX = 20
# Event "type" value that _summarize_events keeps; other types are skipped.
_EVENT_WARNING = "Warning"
# Sort rank per pod phase in _summarize_pod_issues (higher sorts first); the
# keys are also the phases tallied in that function's "counts".
_PHASE_SEVERITY = {
    "Failed": 3,
    "Pending": 2,
    "Unknown": 1,
}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@ -128,6 +147,7 @@ def _node_details(payload: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
details: list[dict[str, Any]] = []
|
||||
for node in _items(payload):
|
||||
metadata = node.get("metadata") if isinstance(node.get("metadata"), dict) else {}
|
||||
spec = node.get("spec") if isinstance(node.get("spec"), dict) else {}
|
||||
status = node.get("status") if isinstance(node.get("status"), dict) else {}
|
||||
node_info = status.get("nodeInfo") if isinstance(status.get("nodeInfo"), dict) else {}
|
||||
labels = metadata.get("labels") if isinstance(metadata.get("labels"), dict) else {}
|
||||
@ -135,6 +155,9 @@ def _node_details(payload: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
if not name:
|
||||
continue
|
||||
roles = _node_roles(labels)
|
||||
conditions = _node_pressure_conditions(status.get("conditions"))
|
||||
created_at = metadata.get("creationTimestamp") if isinstance(metadata.get("creationTimestamp"), str) else ""
|
||||
taints = _node_taints(spec.get("taints"))
|
||||
details.append(
|
||||
{
|
||||
"name": name,
|
||||
@ -149,12 +172,50 @@ def _node_details(payload: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
"kubelet": node_info.get("kubeletVersion") or "",
|
||||
"container_runtime": node_info.get("containerRuntimeVersion") or "",
|
||||
"addresses": _node_addresses(status),
|
||||
"created_at": created_at,
|
||||
"age_hours": _age_hours(created_at),
|
||||
"taints": taints,
|
||||
"unschedulable": bool(spec.get("unschedulable")),
|
||||
"capacity": _node_capacity(status.get("capacity")),
|
||||
"allocatable": _node_capacity(status.get("allocatable")),
|
||||
"pressure": conditions,
|
||||
}
|
||||
)
|
||||
details.sort(key=lambda item: item.get("name") or "")
|
||||
return details
|
||||
|
||||
|
||||
def _age_hours(timestamp: str) -> float | None:
|
||||
if not timestamp:
|
||||
return None
|
||||
try:
|
||||
parsed = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
|
||||
except ValueError:
|
||||
return None
|
||||
return round((datetime.now(timezone.utc) - parsed).total_seconds() / 3600, 1)
|
||||
|
||||
|
||||
def _node_taints(raw: Any) -> list[dict[str, str]]:
    """Normalize a node's raw taint list into ``key``/``value``/``effect`` dicts.

    Args:
        raw: The value found under ``spec.taints``; anything that is not a
            list yields an empty result.

    Returns:
        One dict per entry whose ``key`` and ``effect`` are both strings, in
        input order. A missing or non-string ``value`` becomes ``""``.
    """
    if not isinstance(raw, list):
        return []
    collected: list[dict[str, str]] = []
    for item in raw:
        if not isinstance(item, dict):
            continue
        taint_key = item.get("key")
        taint_effect = item.get("effect")
        if not (isinstance(taint_key, str) and isinstance(taint_effect, str)):
            # Entries without a usable key/effect pair are dropped.
            continue
        taint_value = item.get("value")
        if not isinstance(taint_value, str):
            taint_value = ""
        collected.append({"key": taint_key, "value": taint_value, "effect": taint_effect})
    return collected
|
||||
|
||||
|
||||
def _summarize_inventory(details: list[dict[str, Any]]) -> dict[str, Any]:
|
||||
summary = {
|
||||
"total": 0,
|
||||
@ -164,33 +225,75 @@ def _summarize_inventory(details: list[dict[str, Any]]) -> dict[str, Any]:
|
||||
"by_arch": {},
|
||||
"by_role": {},
|
||||
"not_ready_names": [],
|
||||
"pressure_nodes": {key: [] for key in _PRESSURE_TYPES},
|
||||
}
|
||||
not_ready: list[str] = []
|
||||
for node in details:
|
||||
name = node.get("name") if isinstance(node, dict) else ""
|
||||
if not isinstance(name, str) or not name:
|
||||
continue
|
||||
summary["total"] += 1
|
||||
ready = bool(node.get("ready"))
|
||||
if ready:
|
||||
summary["ready"] += 1
|
||||
else:
|
||||
name = _apply_node_summary(summary, node)
|
||||
if name and not node.get("ready"):
|
||||
not_ready.append(name)
|
||||
if node.get("is_worker"):
|
||||
summary["workers"]["total"] += 1
|
||||
if ready:
|
||||
summary["workers"]["ready"] += 1
|
||||
hardware = node.get("hardware") or "unknown"
|
||||
arch = node.get("arch") or "unknown"
|
||||
summary["by_hardware"][hardware] = summary["by_hardware"].get(hardware, 0) + 1
|
||||
summary["by_arch"][arch] = summary["by_arch"].get(arch, 0) + 1
|
||||
for role in node.get("roles") or []:
|
||||
summary["by_role"][role] = summary["by_role"].get(role, 0) + 1
|
||||
not_ready.sort()
|
||||
summary["not_ready_names"] = not_ready
|
||||
for cond_type in summary["pressure_nodes"]:
|
||||
summary["pressure_nodes"][cond_type].sort()
|
||||
return summary
|
||||
|
||||
|
||||
def _apply_node_summary(summary: dict[str, Any], node: dict[str, Any]) -> str:
    """Fold one node record into the running inventory *summary*.

    Increments the ``total``/``ready`` counters, the ``workers`` sub-counters
    when the node is flagged ``is_worker``, and the ``by_hardware``,
    ``by_arch`` and ``by_role`` tallies, then records pressure conditions via
    ``_apply_pressure``.

    Returns:
        The node name, or ``""`` when the record has no usable name (in which
        case *summary* is left untouched).
    """
    node_name = node.get("name") if isinstance(node, dict) else ""
    if not (isinstance(node_name, str) and node_name):
        return ""

    summary["total"] += 1
    is_ready = bool(node.get("ready"))
    if is_ready:
        summary["ready"] += 1

    if node.get("is_worker"):
        worker_counts = summary["workers"]
        worker_counts["total"] += 1
        if is_ready:
            worker_counts["ready"] += 1

    # Missing or empty hardware/arch values are grouped under "unknown".
    tallies = (
        ("by_hardware", node.get("hardware") or "unknown"),
        ("by_arch", node.get("arch") or "unknown"),
    )
    for bucket, label in tallies:
        counts = summary[bucket]
        counts[label] = counts.get(label, 0) + 1

    for role in node.get("roles") or []:
        role_counts = summary["by_role"]
        role_counts[role] = role_counts.get(role, 0) + 1

    _apply_pressure(summary, node, node_name)
    return node_name
|
||||
|
||||
|
||||
def _apply_pressure(summary: dict[str, Any], node: dict[str, Any], name: str) -> None:
    """Append *name* to each tracked pressure list that is active on *node*.

    Only condition types already present as keys of
    ``summary["pressure_nodes"]`` are recorded; a missing or non-dict
    ``pressure`` value on the node is ignored.
    """
    conditions = node.get("pressure") or {}
    if isinstance(conditions, dict):
        for kind, is_active in conditions.items():
            if not is_active:
                continue
            tracked = summary["pressure_nodes"]
            if kind in tracked:
                tracked[kind].append(name)
|
||||
|
||||
|
||||
def _node_capacity(raw: Any) -> dict[str, str]:
    """Extract the tracked resource quantities from a capacity-style mapping.

    Args:
        raw: The value found under ``status.capacity`` or
            ``status.allocatable``; anything that is not a dict yields ``{}``.

    Returns:
        A dict holding the ``_CAPACITY_KEYS`` entries present in *raw* whose
        value is a non-empty string or a number, each converted to ``str``.
        Keys are inserted in sorted order so the output is identical across
        interpreter runs.
    """
    if not isinstance(raw, dict):
        return {}
    output: dict[str, str] = {}
    # _CAPACITY_KEYS is a set of strings, whose iteration order varies between
    # processes under hash randomization; sort for a stable key order.
    for key in sorted(_CAPACITY_KEYS):
        value = raw.get(key)
        if isinstance(value, (str, int, float)) and value != "":
            output[key] = str(value)
    return output
|
||||
|
||||
|
||||
def _node_pressure_conditions(conditions: Any) -> dict[str, bool]:
    """Map each tracked pressure condition type to whether it is active.

    Args:
        conditions: The value found under ``status.conditions``; anything
            that is not a list yields ``{}``.

    Returns:
        ``{type: status == "True"}`` for every dict entry whose ``type`` is
        in ``_PRESSURE_TYPES``. If a type appears more than once, the last
        occurrence wins.
    """
    if not isinstance(conditions, list):
        return {}
    return {
        item.get("type"): item.get("status") == "True"
        for item in conditions
        if isinstance(item, dict) and item.get("type") in _PRESSURE_TYPES
    }
|
||||
|
||||
|
||||
def _node_roles(labels: dict[str, Any]) -> list[str]:
|
||||
roles: list[str] = []
|
||||
for key in labels.keys():
|
||||
@ -291,6 +394,67 @@ def _namespace_allowed(namespace: str) -> bool:
|
||||
return namespace not in _SYSTEM_NAMESPACES
|
||||
|
||||
|
||||
def _event_timestamp(event: dict[str, Any]) -> str:
    """Return the first non-empty string timestamp carried by *event*.

    Fields are tried in the order ``eventTime``, ``lastTimestamp``,
    ``firstTimestamp``; ``""`` is returned when none holds a non-empty string.
    """
    candidates = (
        event.get(field) for field in ("eventTime", "lastTimestamp", "firstTimestamp")
    )
    return next((stamp for stamp in candidates if isinstance(stamp, str) and stamp), "")
|
||||
|
||||
|
||||
def _event_sort_key(timestamp: str) -> float:
    """Convert an ISO-8601 timestamp into a POSIX timestamp for sorting.

    A ``Z`` suffix is accepted as UTC. Empty or unparseable input maps to
    ``0.0``.
    """
    epoch = 0.0
    if timestamp:
        try:
            epoch = datetime.fromisoformat(timestamp.replace("Z", "+00:00")).timestamp()
        except ValueError:
            epoch = 0.0
    return epoch
|
||||
|
||||
|
||||
def _summarize_events(payload: dict[str, Any]) -> dict[str, Any]:
    """Summarize warning events from an event list payload.

    Events in namespaces rejected by ``_namespace_allowed`` and events whose
    ``type`` is not ``_EVENT_WARNING`` are skipped.

    Returns:
        A dict with ``warnings_total`` (number of kept events),
        ``warnings_by_reason`` and ``warnings_by_namespace`` (sums of the
        events' ``count``, defaulting to 1 per event), and
        ``warnings_recent`` (the newest ``_EVENTS_MAX`` events).
    """

    def _text(source: dict[str, Any], field: str) -> str:
        value = source.get(field)
        return value if isinstance(value, str) else ""

    def _section(source: dict[str, Any], field: str) -> dict[str, Any]:
        value = source.get(field)
        return value if isinstance(value, dict) else {}

    def _recency(item: dict[str, Any]) -> float:
        return _event_sort_key(item.get("last_seen") or "")

    kept: list[dict[str, Any]] = []
    reason_totals: dict[str, int] = {}
    namespace_totals: dict[str, int] = {}
    for event in _items(payload):
        namespace = _text(_section(event, "metadata"), "namespace")
        if not _namespace_allowed(namespace):
            continue
        if _text(event, "type") != _EVENT_WARNING:
            continue
        reason = _text(event, "reason")
        raw_count = event.get("count")
        occurrences = raw_count if isinstance(raw_count, int) else 1
        target = _section(event, "involvedObject")
        kept.append(
            {
                "namespace": namespace,
                "reason": reason,
                "message": _text(event, "message"),
                "count": occurrences,
                "last_seen": _event_timestamp(event),
                "object_kind": target.get("kind") or "",
                "object_name": target.get("name") or "",
            }
        )
        if reason:
            reason_totals[reason] = reason_totals.get(reason, 0) + occurrences
        if namespace:
            namespace_totals[namespace] = namespace_totals.get(namespace, 0) + occurrences

    # Newest first; events with equal timestamps keep their input order.
    newest_first = sorted(kept, key=_recency, reverse=True)
    return {
        "warnings_total": len(newest_first),
        "warnings_by_reason": reason_totals,
        "warnings_by_namespace": namespace_totals,
        "warnings_recent": newest_first[:_EVENTS_MAX],
    }
|
||||
|
||||
|
||||
def _workload_from_labels(labels: dict[str, Any]) -> tuple[str, str]:
|
||||
for key in _WORKLOAD_LABEL_KEYS:
|
||||
value = labels.get(key)
|
||||
@ -397,6 +561,344 @@ def _summarize_namespace_pods(payload: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
return output
|
||||
|
||||
|
||||
def _summarize_namespace_nodes(payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Summarize, per namespace, how its scheduled pods spread across nodes.

    Pods in namespaces rejected by ``_namespace_allowed`` and pods without a
    ``spec.nodeName`` are ignored.

    Returns:
        One dict per namespace with ``namespace``, ``pods_total``,
        ``pods_running``, ``nodes`` (node name -> pod count) and
        ``primary_node`` (the node hosting the most pods, ties broken by
        name), ordered by descending ``pods_total`` and then namespace.
    """

    def _section(source: dict[str, Any], field: str) -> dict[str, Any]:
        value = source.get(field)
        return value if isinstance(value, dict) else {}

    def _text(source: dict[str, Any], field: str) -> str:
        value = source.get(field)
        return value if isinstance(value, str) else ""

    by_namespace: dict[str, dict[str, Any]] = {}
    for pod in _items(payload):
        namespace = _text(_section(pod, "metadata"), "namespace")
        if not _namespace_allowed(namespace):
            continue
        node_name = _text(_section(pod, "spec"), "nodeName")
        if not node_name:
            continue
        if namespace not in by_namespace:
            by_namespace[namespace] = {
                "namespace": namespace,
                "pods_total": 0,
                "pods_running": 0,
                "nodes": {},
            }
        record = by_namespace[namespace]
        record["pods_total"] += 1
        if _text(_section(pod, "status"), "phase") == "Running":
            record["pods_running"] += 1
        placement = record["nodes"]
        placement[node_name] = placement.get(node_name, 0) + 1

    for record in by_namespace.values():
        placement = record.get("nodes") or {}
        if isinstance(placement, dict) and placement:
            # Highest pod count wins; equal counts fall back to name order.
            busiest = min(placement.items(), key=lambda pair: (-pair[1], pair[0]))
            record["primary_node"] = busiest[0]
        else:
            record["primary_node"] = ""

    return sorted(
        by_namespace.values(),
        key=lambda item: (-item.get("pods_total", 0), item.get("namespace") or ""),
    )
|
||||
|
||||
|
||||
# Maps a pod phase to the per-node counter that _node_pod_apply increments;
# phases not listed here only contribute to "pods_total".
_NODE_PHASE_KEYS = {
    "Running": "pods_running",
    "Pending": "pods_pending",
    "Failed": "pods_failed",
    "Succeeded": "pods_succeeded",
}
|
||||
|
||||
|
||||
def _summarize_node_pods(payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Build per-node pod statistics from a pod list payload.

    Each pod is reduced to ``(node, namespace, phase)`` by
    ``_node_pod_context``; pods for which that returns nothing are skipped.
    The remaining pods are accumulated per node and the result is produced by
    ``_node_pod_finalize``.
    """
    per_node: dict[str, dict[str, Any]] = {}
    for pod in _items(payload):
        placement = _node_pod_context(pod)
        if placement:
            node_name, namespace, phase = placement
            _node_pod_apply(_node_pod_entry(per_node, node_name), namespace, phase)
    return _node_pod_finalize(per_node)
|
||||
|
||||
|
||||
def _node_pod_context(pod: dict[str, Any]) -> tuple[str, str, str] | None:
    """Extract ``(node, namespace, phase)`` from a pod object.

    Returns:
        The tuple, or ``None`` when the pod's namespace is rejected by
        ``_namespace_allowed`` or the pod has no ``spec.nodeName`` string.
        A missing or non-string phase is reported as ``""``.
    """
    meta = pod.get("metadata")
    namespace = meta.get("namespace") if isinstance(meta, dict) else None
    if not isinstance(namespace, str):
        namespace = ""
    if not _namespace_allowed(namespace):
        return None

    spec = pod.get("spec")
    node_name = spec.get("nodeName") if isinstance(spec, dict) else None
    if not isinstance(node_name, str) or not node_name:
        return None

    status = pod.get("status")
    phase = status.get("phase") if isinstance(status, dict) else None
    if not isinstance(phase, str):
        phase = ""
    return node_name, namespace, phase
|
||||
|
||||
|
||||
def _node_pod_entry(nodes: dict[str, dict[str, Any]], node: str) -> dict[str, Any]:
    """Return the accumulator for *node*, creating a zeroed one on first use.

    The new accumulator is stored in *nodes* under *node*, so later calls with
    the same name return the same dict.
    """
    if node not in nodes:
        nodes[node] = {
            "node": node,
            "pods_total": 0,
            "pods_running": 0,
            "pods_pending": 0,
            "pods_failed": 0,
            "pods_succeeded": 0,
            "namespaces": {},
        }
    return nodes[node]
|
||||
|
||||
|
||||
def _node_pod_apply(entry: dict[str, Any], namespace: str, phase: str) -> None:
    """Count one pod in a node accumulator.

    Always increments ``pods_total``; increments the phase counter named by
    ``_NODE_PHASE_KEYS`` when *phase* is listed there; and, when *namespace*
    is non-empty, increments that namespace's count under ``namespaces``.
    """
    entry["pods_total"] += 1

    counter_name = _NODE_PHASE_KEYS.get(phase)
    if counter_name:
        entry[counter_name] += 1

    if not namespace:
        return
    per_namespace = entry["namespaces"]
    per_namespace[namespace] = per_namespace.get(namespace, 0) + 1
|
||||
|
||||
|
||||
def _node_pod_finalize(nodes: dict[str, dict[str, Any]]) -> list[dict[str, Any]]:
    """Attach the top namespaces to every node accumulator and order them.

    Each entry gains ``namespaces_top``: up to three ``(namespace, count)``
    pairs, highest count first and ties broken by namespace name.

    Returns:
        The entries ordered by descending ``pods_total`` and then node name.
    """

    def _by_count_then_name(pair: tuple[str, int]) -> tuple[int, str]:
        return (-pair[1], pair[0])

    def _by_load_then_node(item: dict[str, Any]) -> tuple[int, str]:
        return (-item.get("pods_total", 0), item.get("node") or "")

    for record in nodes.values():
        per_namespace = record.get("namespaces") or {}
        if isinstance(per_namespace, dict):
            ranked = sorted(per_namespace.items(), key=_by_count_then_name)
            record["namespaces_top"] = ranked[:3]
    return sorted(nodes.values(), key=_by_load_then_node)
|
||||
|
||||
|
||||
def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
    """Collect pods that are in a problem phase or have restarted containers.

    A pod is reported when its phase is a key of ``_PHASE_SEVERITY`` or the
    sum of its containers' ``restartCount`` is positive. Pods without a name
    or namespace are skipped.

    Returns:
        ``{"counts": ..., "items": ...}`` where ``counts`` tallies pods per
        ``_PHASE_SEVERITY`` phase and ``items`` holds at most 20 reported
        pods, ordered by phase severity, then restart count (both
        descending), then namespace and pod name.
    """

    def _section(source: dict[str, Any], field: str) -> dict[str, Any]:
        value = source.get(field)
        return value if isinstance(value, dict) else {}

    def _text(source: dict[str, Any], field: str) -> str:
        value = source.get(field)
        return value if isinstance(value, str) else ""

    def _priority(item: dict[str, Any]) -> tuple[int, int, str, str]:
        return (
            -_PHASE_SEVERITY.get(item.get("phase") or "", 0),
            -(item.get("restarts") or 0),
            item.get("namespace") or "",
            item.get("pod") or "",
        )

    phase_counts: dict[str, int] = dict.fromkeys(_PHASE_SEVERITY, 0)
    flagged: list[dict[str, Any]] = []
    for pod in _items(payload):
        meta = _section(pod, "metadata")
        pod_status = _section(pod, "status")
        pod_spec = _section(pod, "spec")
        namespace = _text(meta, "namespace")
        pod_name = _text(meta, "name")
        if not (pod_name and namespace):
            continue
        phase = _text(pod_status, "phase")

        restart_total = 0
        waiting: list[str] = []
        for container in pod_status.get("containerStatuses") or []:
            if not isinstance(container, dict):
                continue
            restart_total += int(container.get("restartCount") or 0)
            why = _section(_section(container, "state"), "waiting").get("reason")
            if isinstance(why, str) and why:
                waiting.append(why)

        if phase in phase_counts:
            phase_counts[phase] += 1
        if restart_total > 0 or phase in _PHASE_SEVERITY:
            flagged.append(
                {
                    "namespace": namespace,
                    "pod": pod_name,
                    "node": pod_spec.get("nodeName") or "",
                    "phase": phase,
                    "reason": pod_status.get("reason") or "",
                    "restarts": restart_total,
                    # De-duplicated and sorted so the output is stable.
                    "waiting_reasons": sorted(set(waiting)),
                }
            )

    worst_first = sorted(flagged, key=_priority)
    return {"counts": phase_counts, "items": worst_first[:20]}
|
||||
|
||||
|
||||
def _summarize_deployments(payload: dict[str, Any]) -> dict[str, Any]:
    """Report deployments whose ready or available replicas lag the target.

    Deployments with ``spec.replicas`` of zero (or unset) are not evaluated.

    Returns:
        ``total`` (all deployments in the payload), ``not_ready`` (number of
        lagging deployments) and ``items`` (their replica counts, ordered by
        namespace and name).
    """

    def _section(source: dict[str, Any], field: str) -> dict[str, Any]:
        value = source.get(field)
        return value if isinstance(value, dict) else {}

    def _text(source: dict[str, Any], field: str) -> str:
        value = source.get(field)
        return value if isinstance(value, str) else ""

    deployments = _items(payload)
    lagging: list[dict[str, Any]] = []
    for deployment in deployments:
        meta = _section(deployment, "metadata")
        spec = _section(deployment, "spec")
        status = _section(deployment, "status")
        record = {
            "name": _text(meta, "name"),
            "namespace": _text(meta, "namespace"),
            "desired": int(spec.get("replicas") or 0),
            "ready": int(status.get("readyReplicas") or 0),
            "available": int(status.get("availableReplicas") or 0),
            "updated": int(status.get("updatedReplicas") or 0),
        }
        target = record["desired"]
        if target <= 0:
            continue
        if record["ready"] < target or record["available"] < target:
            lagging.append(record)

    lagging.sort(key=lambda item: (item.get("namespace") or "", item.get("name") or ""))
    return {"total": len(deployments), "not_ready": len(lagging), "items": lagging}
|
||||
|
||||
|
||||
def _summarize_statefulsets(payload: dict[str, Any]) -> dict[str, Any]:
    """Report statefulsets whose ready replicas lag the desired count.

    Statefulsets with ``spec.replicas`` of zero (or unset) are not evaluated.

    Returns:
        ``total`` (all statefulsets in the payload), ``not_ready`` (number of
        lagging statefulsets) and ``items`` (their replica counts, ordered by
        namespace and name).
    """

    def _section(source: dict[str, Any], field: str) -> dict[str, Any]:
        value = source.get(field)
        return value if isinstance(value, dict) else {}

    def _text(source: dict[str, Any], field: str) -> str:
        value = source.get(field)
        return value if isinstance(value, str) else ""

    statefulsets = _items(payload)
    lagging: list[dict[str, Any]] = []
    for statefulset in statefulsets:
        meta = _section(statefulset, "metadata")
        spec = _section(statefulset, "spec")
        status = _section(statefulset, "status")
        record = {
            "name": _text(meta, "name"),
            "namespace": _text(meta, "namespace"),
            "desired": int(spec.get("replicas") or 0),
            "ready": int(status.get("readyReplicas") or 0),
            "current": int(status.get("currentReplicas") or 0),
            "updated": int(status.get("updatedReplicas") or 0),
        }
        if record["desired"] <= 0:
            continue
        if record["ready"] < record["desired"]:
            lagging.append(record)

    lagging.sort(key=lambda item: (item.get("namespace") or "", item.get("name") or ""))
    return {"total": len(statefulsets), "not_ready": len(lagging), "items": lagging}
|
||||
|
||||
|
||||
def _summarize_daemonsets(payload: dict[str, Any]) -> dict[str, Any]:
    """Report daemonsets whose ready pod count lags the scheduled target.

    Daemonsets with ``status.desiredNumberScheduled`` of zero (or unset) are
    not evaluated.

    Returns:
        ``total`` (all daemonsets in the payload), ``not_ready`` (number of
        lagging daemonsets) and ``items`` (their counts, ordered by namespace
        and name).
    """

    def _section(source: dict[str, Any], field: str) -> dict[str, Any]:
        value = source.get(field)
        return value if isinstance(value, dict) else {}

    def _text(source: dict[str, Any], field: str) -> str:
        value = source.get(field)
        return value if isinstance(value, str) else ""

    daemonsets = _items(payload)
    lagging: list[dict[str, Any]] = []
    for daemonset in daemonsets:
        meta = _section(daemonset, "metadata")
        status = _section(daemonset, "status")
        record = {
            "name": _text(meta, "name"),
            "namespace": _text(meta, "namespace"),
            "desired": int(status.get("desiredNumberScheduled") or 0),
            "ready": int(status.get("numberReady") or 0),
            "updated": int(status.get("updatedNumberScheduled") or 0),
        }
        if record["desired"] <= 0:
            continue
        if record["ready"] < record["desired"]:
            lagging.append(record)

    lagging.sort(key=lambda item: (item.get("namespace") or "", item.get("name") or ""))
    return {"total": len(daemonsets), "not_ready": len(lagging), "items": lagging}
|
||||
|
||||
|
||||
def _summarize_workload_health(
    deployments: dict[str, Any],
    statefulsets: dict[str, Any],
    daemonsets: dict[str, Any],
) -> dict[str, Any]:
    """Bundle the three workload summaries into one dict keyed by kind."""
    health: dict[str, Any] = {}
    health["deployments"] = deployments
    health["statefulsets"] = statefulsets
    health["daemonsets"] = daemonsets
    return health
|
||||
|
||||
|
||||
def _fetch_nodes(errors: list[str]) -> tuple[dict[str, Any], list[dict[str, Any]], dict[str, Any]]:
    """Fetch the node list and derive its summary, details and inventory.

    Any exception is recorded in *errors* as ``"nodes: <exc>"`` instead of
    being raised; results computed before the failure are still returned,
    and the rest stay empty.

    Returns:
        ``(nodes, details, summary)``.
    """
    node_overview: dict[str, Any] = {}
    node_rows: list[dict[str, Any]] = []
    inventory: dict[str, Any] = {}
    try:
        response = get_json("/api/v1/nodes")
        node_overview = _summarize_nodes(response)
        node_rows = _node_details(response)
        inventory = _summarize_inventory(node_rows)
    except Exception as exc:
        errors.append(f"nodes: {exc}")
    return node_overview, node_rows, inventory
|
||||
|
||||
|
||||
def _fetch_flux(errors: list[str]) -> dict[str, Any]:
    """Fetch the kustomizations in ``flux-system`` and summarize them.

    Any exception is recorded in *errors* as ``"flux: <exc>"`` and ``{}`` is
    returned.
    """
    endpoint = "/apis/kustomize.toolkit.fluxcd.io/v1/namespaces/flux-system/kustomizations"
    try:
        return _summarize_kustomizations(get_json(endpoint))
    except Exception as exc:
        errors.append(f"flux: {exc}")
    return {}
|
||||
|
||||
|
||||
def _fetch_pods(
    errors: list[str],
) -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]], dict[str, Any]]:
    """Fetch the pod list once and derive every pod-based summary from it.

    Any exception is recorded in *errors* as ``"pods: <exc>"`` instead of
    being raised; summaries computed before the failure are still returned,
    and the rest stay empty.

    Returns:
        ``(workloads, namespace_pods, namespace_nodes, node_pods, pod_issues)``.
    """
    workload_rows: list[dict[str, Any]] = []
    pods_by_namespace: list[dict[str, Any]] = []
    nodes_by_namespace: list[dict[str, Any]] = []
    pods_by_node: list[dict[str, Any]] = []
    issues: dict[str, Any] = {}
    try:
        response = get_json("/api/v1/pods?limit=5000")
        workload_rows = _summarize_workloads(response)
        pods_by_namespace = _summarize_namespace_pods(response)
        nodes_by_namespace = _summarize_namespace_nodes(response)
        pods_by_node = _summarize_node_pods(response)
        issues = _summarize_pod_issues(response)
    except Exception as exc:
        errors.append(f"pods: {exc}")
    return workload_rows, pods_by_namespace, nodes_by_namespace, pods_by_node, issues
|
||||
|
||||
|
||||
def _fetch_workload_health(errors: list[str]) -> dict[str, Any]:
    """Fetch deployments, statefulsets and daemonsets and summarize them.

    All three lists are fetched before any is summarized. Any exception is
    recorded in *errors* as ``"workloads_health: <exc>"`` and ``{}`` is
    returned, even if some of the requests succeeded.
    """
    endpoints = (
        "/apis/apps/v1/deployments?limit=2000",
        "/apis/apps/v1/statefulsets?limit=2000",
        "/apis/apps/v1/daemonsets?limit=2000",
    )
    try:
        raw_deployments, raw_statefulsets, raw_daemonsets = [
            get_json(endpoint) for endpoint in endpoints
        ]
        return _summarize_workload_health(
            _summarize_deployments(raw_deployments),
            _summarize_statefulsets(raw_statefulsets),
            _summarize_daemonsets(raw_daemonsets),
        )
    except Exception as exc:
        errors.append(f"workloads_health: {exc}")
    return {}
|
||||
|
||||
|
||||
def _fetch_events(errors: list[str]) -> dict[str, Any]:
    """Fetch cluster events and summarize the warnings.

    Any exception is recorded in *errors* as ``"events: <exc>"`` and ``{}``
    is returned.
    """
    try:
        return _summarize_events(get_json("/api/v1/events?limit=2000"))
    except Exception as exc:
        errors.append(f"events: {exc}")
    return {}
|
||||
|
||||
|
||||
def _vm_query(expr: str) -> list[dict[str, Any]] | None:
|
||||
base = settings.vm_url
|
||||
if not base:
|
||||
@ -548,11 +1050,29 @@ def _node_usage(errors: list[str]) -> dict[str, Any]:
|
||||
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
|
||||
"node",
|
||||
)
|
||||
usage["disk"] = _vm_node_metric(
|
||||
'avg by (node) (((1 - avg by (instance) (node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} '
|
||||
'/ node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})) * 100) * on(instance) group_left(node) '
|
||||
'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
|
||||
"node",
|
||||
)
|
||||
except Exception as exc:
|
||||
errors.append(f"node_usage: {exc}")
|
||||
return usage
|
||||
|
||||
|
||||
def _pvc_usage(errors: list[str]) -> list[dict[str, Any]]:
    """Query the five fullest persistent volume claims by used percentage.

    The vector returned by ``_vm_vector`` is passed through
    ``_filter_namespace_vector``. Any exception is recorded in *errors* as
    ``"pvc_usage: <exc>"`` and ``[]`` is returned.
    """
    query = (
        "topk(5, max by (namespace,persistentvolumeclaim) "
        "(kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100))"
    )
    try:
        return _filter_namespace_vector(_vm_vector(query))
    except Exception as exc:
        errors.append(f"pvc_usage: {exc}")
    return []
|
||||
|
||||
|
||||
def _usage_stats(series: list[dict[str, Any]]) -> dict[str, float]:
|
||||
values: list[float] = []
|
||||
for entry in series:
|
||||
@ -578,6 +1098,12 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
||||
metrics["nodes_ready"] = _vm_scalar(
|
||||
"count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})"
|
||||
)
|
||||
metrics["capacity_cpu"] = _vm_scalar("sum(kube_node_status_capacity_cpu_cores)")
|
||||
metrics["allocatable_cpu"] = _vm_scalar("sum(kube_node_status_allocatable_cpu_cores)")
|
||||
metrics["capacity_mem_bytes"] = _vm_scalar("sum(kube_node_status_capacity_memory_bytes)")
|
||||
metrics["allocatable_mem_bytes"] = _vm_scalar("sum(kube_node_status_allocatable_memory_bytes)")
|
||||
metrics["capacity_pods"] = _vm_scalar("sum(kube_node_status_capacity_pods)")
|
||||
metrics["allocatable_pods"] = _vm_scalar("sum(kube_node_status_allocatable_pods)")
|
||||
metrics["pods_running"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Running\"})")
|
||||
metrics["pods_pending"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Pending\"})")
|
||||
metrics["pods_failed"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Failed\"})")
|
||||
@ -585,6 +1111,19 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
||||
metrics["top_restarts_1h"] = _vm_vector(
|
||||
f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
|
||||
)
|
||||
metrics["pod_cpu_top"] = _filter_namespace_vector(
|
||||
_vm_vector(
|
||||
f'topk(5, sum by (namespace,pod) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}])))'
|
||||
)
|
||||
)
|
||||
metrics["pod_mem_top"] = _filter_namespace_vector(
|
||||
_vm_vector(
|
||||
"topk(5, sum by (namespace,pod) (container_memory_working_set_bytes{namespace!=\"\"}))"
|
||||
)
|
||||
)
|
||||
metrics["job_failures_24h"] = _vm_vector(
|
||||
"topk(5, sum by (namespace,job_name) (increase(kube_job_status_failed[24h])))"
|
||||
)
|
||||
except Exception as exc:
|
||||
errors.append(f"vm: {exc}")
|
||||
metrics["postgres_connections"] = _postgres_connections(errors)
|
||||
@ -595,6 +1134,7 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
||||
"ram": _usage_stats(metrics.get("node_usage", {}).get("ram", [])),
|
||||
"net": _usage_stats(metrics.get("node_usage", {}).get("net", [])),
|
||||
"io": _usage_stats(metrics.get("node_usage", {}).get("io", [])),
|
||||
"disk": _usage_stats(metrics.get("node_usage", {}).get("disk", [])),
|
||||
}
|
||||
try:
|
||||
metrics["namespace_cpu_top"] = _filter_namespace_vector(
|
||||
@ -607,16 +1147,52 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
||||
"topk(5, sum by (namespace) (container_memory_working_set_bytes{namespace!=\"\"}))"
|
||||
)
|
||||
)
|
||||
metrics["namespace_cpu_requests_top"] = _filter_namespace_vector(
|
||||
_vm_vector(
|
||||
"topk(5, sum by (namespace) (kube_pod_container_resource_requests_cpu_cores))"
|
||||
)
|
||||
)
|
||||
metrics["namespace_mem_requests_top"] = _filter_namespace_vector(
|
||||
_vm_vector(
|
||||
"topk(5, sum by (namespace) (kube_pod_container_resource_requests_memory_bytes))"
|
||||
)
|
||||
)
|
||||
metrics["namespace_net_top"] = _filter_namespace_vector(
|
||||
_vm_vector(
|
||||
f"topk(5, sum by (namespace) (rate(container_network_receive_bytes_total{{namespace!=\"\"}}[{_RATE_WINDOW}]) + rate(container_network_transmit_bytes_total{{namespace!=\"\"}}[{_RATE_WINDOW}])))"
|
||||
)
|
||||
)
|
||||
metrics["namespace_io_top"] = _filter_namespace_vector(
|
||||
_vm_vector(
|
||||
f"topk(5, sum by (namespace) (rate(container_fs_reads_bytes_total{{namespace!=\"\"}}[{_RATE_WINDOW}]) + rate(container_fs_writes_bytes_total{{namespace!=\"\"}}[{_RATE_WINDOW}])))"
|
||||
)
|
||||
)
|
||||
except Exception as exc:
|
||||
errors.append(f"namespace_usage: {exc}")
|
||||
metrics["pvc_usage_top"] = _pvc_usage(errors)
|
||||
metrics["units"] = {
|
||||
"cpu": "percent",
|
||||
"ram": "percent",
|
||||
"net": "bytes_per_sec",
|
||||
"io": "bytes_per_sec",
|
||||
"disk": "percent",
|
||||
"restarts": "count",
|
||||
"pod_cpu": "cores",
|
||||
"pod_mem": "bytes",
|
||||
"job_failures_24h": "count",
|
||||
"namespace_cpu": "cores",
|
||||
"namespace_mem": "bytes",
|
||||
"namespace_cpu_requests": "cores",
|
||||
"namespace_mem_requests": "bytes",
|
||||
"namespace_net": "bytes_per_sec",
|
||||
"namespace_io": "bytes_per_sec",
|
||||
"pvc_used_percent": "percent",
|
||||
"capacity_cpu": "cores",
|
||||
"allocatable_cpu": "cores",
|
||||
"capacity_mem_bytes": "bytes",
|
||||
"allocatable_mem_bytes": "bytes",
|
||||
"capacity_pods": "count",
|
||||
"allocatable_pods": "count",
|
||||
}
|
||||
metrics["windows"] = {
|
||||
"rates": _RATE_WINDOW,
|
||||
@ -629,45 +1205,27 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
|
||||
errors: list[str] = []
|
||||
collected_at = datetime.now(timezone.utc)
|
||||
|
||||
nodes: dict[str, Any] | None = None
|
||||
node_details: list[dict[str, Any]] = []
|
||||
node_summary: dict[str, Any] = {}
|
||||
try:
|
||||
payload = get_json("/api/v1/nodes")
|
||||
nodes = _summarize_nodes(payload)
|
||||
node_details = _node_details(payload)
|
||||
node_summary = _summarize_inventory(node_details)
|
||||
except Exception as exc:
|
||||
errors.append(f"nodes: {exc}")
|
||||
|
||||
kustomizations: dict[str, Any] | None = None
|
||||
try:
|
||||
payload = get_json(
|
||||
"/apis/kustomize.toolkit.fluxcd.io/v1/namespaces/flux-system/kustomizations"
|
||||
)
|
||||
kustomizations = _summarize_kustomizations(payload)
|
||||
except Exception as exc:
|
||||
errors.append(f"flux: {exc}")
|
||||
|
||||
workloads: list[dict[str, Any]] = []
|
||||
namespace_pods: list[dict[str, Any]] = []
|
||||
try:
|
||||
pods_payload = get_json("/api/v1/pods?limit=5000")
|
||||
workloads = _summarize_workloads(pods_payload)
|
||||
namespace_pods = _summarize_namespace_pods(pods_payload)
|
||||
except Exception as exc:
|
||||
errors.append(f"pods: {exc}")
|
||||
nodes, node_details, node_summary = _fetch_nodes(errors)
|
||||
kustomizations = _fetch_flux(errors)
|
||||
workloads, namespace_pods, namespace_nodes, node_pods, pod_issues = _fetch_pods(errors)
|
||||
workload_health = _fetch_workload_health(errors)
|
||||
events = _fetch_events(errors)
|
||||
|
||||
metrics = _summarize_metrics(errors)
|
||||
|
||||
snapshot = {
|
||||
"collected_at": collected_at.isoformat(),
|
||||
"nodes": nodes or {},
|
||||
"nodes": nodes,
|
||||
"nodes_summary": node_summary,
|
||||
"nodes_detail": node_details,
|
||||
"flux": kustomizations or {},
|
||||
"flux": kustomizations,
|
||||
"workloads": workloads,
|
||||
"namespace_pods": namespace_pods,
|
||||
"namespace_nodes": namespace_nodes,
|
||||
"node_pods": node_pods,
|
||||
"pod_issues": pod_issues,
|
||||
"workloads_health": workload_health,
|
||||
"events": events,
|
||||
"metrics": metrics,
|
||||
"errors": errors,
|
||||
}
|
||||
|
||||
@ -31,7 +31,16 @@ def test_collect_cluster_state(monkeypatch) -> None:
|
||||
},
|
||||
},
|
||||
{
|
||||
"metadata": {"name": "node-b", "labels": {"kubernetes.io/arch": "amd64"}},
|
||||
"metadata": {
|
||||
"name": "node-b",
|
||||
"labels": {"kubernetes.io/arch": "amd64"},
|
||||
"creationTimestamp": "2026-01-01T00:00:00Z",
|
||||
},
|
||||
"spec": {
|
||||
"taints": [
|
||||
{"key": "node-role.kubernetes.io/control-plane", "effect": "NoSchedule"}
|
||||
]
|
||||
},
|
||||
"status": {
|
||||
"conditions": [{"type": "Ready", "status": "False"}],
|
||||
"nodeInfo": {"architecture": "amd64"},
|
||||
@ -53,6 +62,37 @@ def test_collect_cluster_state(monkeypatch) -> None:
|
||||
}
|
||||
]
|
||||
}
|
||||
if path.startswith("/api/v1/events"):
|
||||
return {"items": []}
|
||||
if path.startswith("/apis/apps/v1/deployments"):
|
||||
return {
|
||||
"items": [
|
||||
{
|
||||
"metadata": {"name": "api", "namespace": "apps"},
|
||||
"spec": {"replicas": 2},
|
||||
"status": {"readyReplicas": 1, "availableReplicas": 1, "updatedReplicas": 1},
|
||||
}
|
||||
]
|
||||
}
|
||||
if path.startswith("/apis/apps/v1/statefulsets"):
|
||||
return {
|
||||
"items": [
|
||||
{
|
||||
"metadata": {"name": "db", "namespace": "apps"},
|
||||
"spec": {"replicas": 1},
|
||||
"status": {"readyReplicas": 1, "currentReplicas": 1, "updatedReplicas": 1},
|
||||
}
|
||||
]
|
||||
}
|
||||
if path.startswith("/apis/apps/v1/daemonsets"):
|
||||
return {
|
||||
"items": [
|
||||
{
|
||||
"metadata": {"name": "agent", "namespace": "apps"},
|
||||
"status": {"desiredNumberScheduled": 3, "numberReady": 3, "updatedNumberScheduled": 3},
|
||||
}
|
||||
]
|
||||
}
|
||||
return {
|
||||
"items": [
|
||||
{
|
||||
@ -78,13 +118,31 @@ def test_collect_cluster_state(monkeypatch) -> None:
|
||||
assert snapshot["flux"]["not_ready"] == 1
|
||||
assert snapshot["nodes_summary"]["total"] == 2
|
||||
assert snapshot["nodes_summary"]["ready"] == 1
|
||||
assert "pressure_nodes" in snapshot["nodes_summary"]
|
||||
assert snapshot["nodes_detail"]
|
||||
assert snapshot["nodes_detail"][1]["age_hours"] is not None
|
||||
assert snapshot["nodes_detail"][1]["taints"]
|
||||
assert snapshot["workloads"]
|
||||
assert snapshot["namespace_pods"]
|
||||
assert snapshot["namespace_pods"][0]["namespace"] == "media"
|
||||
assert snapshot["namespace_nodes"]
|
||||
assert snapshot["node_pods"]
|
||||
assert "pod_issues" in snapshot
|
||||
assert "workloads_health" in snapshot
|
||||
assert snapshot["workloads_health"]["deployments"]["total"] == 1
|
||||
assert snapshot["workloads_health"]["deployments"]["not_ready"] == 1
|
||||
assert snapshot["events"]["warnings_total"] == 0
|
||||
assert "node_usage_stats" in snapshot["metrics"]
|
||||
assert snapshot["metrics"]["namespace_cpu_top"] == []
|
||||
assert snapshot["metrics"]["namespace_mem_top"] == []
|
||||
assert snapshot["metrics"]["namespace_cpu_requests_top"] == []
|
||||
assert snapshot["metrics"]["namespace_mem_requests_top"] == []
|
||||
assert snapshot["metrics"]["namespace_net_top"] == []
|
||||
assert snapshot["metrics"]["namespace_io_top"] == []
|
||||
assert snapshot["metrics"]["pod_cpu_top"] == []
|
||||
assert snapshot["metrics"]["pod_mem_top"] == []
|
||||
assert snapshot["metrics"]["job_failures_24h"] == []
|
||||
assert snapshot["metrics"]["pvc_usage_top"] == []
|
||||
assert summary.nodes_total == 2
|
||||
assert summary.nodes_ready == 1
|
||||
assert summary.pods_running == 5.0
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user