ariadne/ariadne/services/cluster_state_nodes.py

402 lines
15 KiB
Python

from __future__ import annotations
from datetime import datetime, timezone
from typing import Any
from .cluster_state_contract import *
def _node_usage_by_hardware(node_load: list[dict[str, Any]], node_details: list[dict[str, Any]]) -> list[dict[str, Any]]:
if not node_load or not node_details:
return []
hardware_by_node = _hardware_map(node_details)
buckets: dict[str, dict[str, list[float]]] = {}
for entry in node_load:
if not isinstance(entry, dict):
continue
node = entry.get("node")
if not isinstance(node, str) or not node:
continue
hardware = hardware_by_node.get(node, "unknown")
_append_hardware_usage(buckets, str(hardware), entry)
return _finalize_hardware_usage(buckets)
def _hardware_map(node_details: list[dict[str, Any]]) -> dict[str, str]:
mapping: dict[str, str] = {}
for node in node_details:
if not isinstance(node, dict):
continue
name = node.get("name")
if isinstance(name, str) and name:
mapping[name] = str(node.get("hardware") or "unknown")
return mapping
def _append_hardware_usage(buckets: dict[str, dict[str, list[float]]], hardware: str, entry: dict[str, Any]) -> None:
bucket = buckets.setdefault(hardware, {"load_index": [], "cpu": [], "ram": [], "net": [], "io": []})
for key in ("load_index", "cpu", "ram", "net", "io"):
value = entry.get(key)
if isinstance(value, (int, float)):
bucket[key].append(float(value))
def _finalize_hardware_usage(buckets: dict[str, dict[str, list[float]]]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for hardware, metrics in buckets.items():
row: dict[str, Any] = {"hardware": hardware}
for key, values in metrics.items():
if values:
row[key] = sum(values) / len(values)
output.append(row)
output.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("hardware") or ""))
return output
def _node_ready(conditions: Any) -> bool:
if not isinstance(conditions, list):
return False
for condition in conditions:
if not isinstance(condition, dict):
continue
if condition.get("type") == "Ready":
return condition.get("status") == "True"
return False
def _summarize_nodes(payload: dict[str, Any]) -> dict[str, Any]:
    """Count the nodes in ``payload`` and split them into ready / not ready.

    Items come from ``_items(payload)``. An item is counted only when its
    ``metadata.name`` is a non-empty string; readiness is decided by
    ``_node_ready`` on ``status.conditions``. Both name lists in the result
    are sorted.
    """
    all_names: list[str] = []
    unready: list[str] = []
    for item in _items(payload):
        meta = item.get("metadata")
        node_name = meta.get("name") if isinstance(meta, dict) else None
        if not isinstance(node_name, str) or not node_name:
            continue
        all_names.append(node_name)
        node_status = item.get("status")
        conditions = node_status.get("conditions") if isinstance(node_status, dict) else None
        if not _node_ready(conditions):
            unready.append(node_name)
    all_names.sort()
    unready.sort()
    node_count = len(all_names)
    unready_count = len(unready)
    return {
        "total": node_count,
        "ready": node_count - unready_count,
        "not_ready": unready_count,
        "names": all_names,
        "not_ready_names": unready,
    }
def _node_labels(labels: dict[str, Any]) -> dict[str, Any]:
if not isinstance(labels, dict):
return {}
keep: dict[str, Any] = {}
for key, value in labels.items():
if key.startswith("node-role.kubernetes.io/"):
keep[key] = value
if key in {
"kubernetes.io/arch",
"kubernetes.io/hostname",
"beta.kubernetes.io/arch",
"hardware",
"jetson",
}:
keep[key] = value
return keep
def _node_addresses(status: dict[str, Any]) -> dict[str, str]:
addresses = status.get("addresses") if isinstance(status.get("addresses"), list) else []
output: dict[str, str] = {}
for addr in addresses:
if not isinstance(addr, dict):
continue
addr_type = addr.get("type")
addr_value = addr.get("address")
if isinstance(addr_type, str) and isinstance(addr_value, str):
output[addr_type] = addr_value
return output
def _node_details(payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Flatten every named node in ``payload`` into a detail row.

    Items come from ``_items(payload)``. Sub-objects (``metadata``, ``spec``,
    ``status``, ``status.nodeInfo``, ``metadata.labels``) that are not dicts
    are treated as empty. Items without a non-empty string ``metadata.name``
    are dropped. The result is sorted by node name.
    """

    def _as_dict(value: Any) -> dict[str, Any]:
        # Tolerate missing or malformed sub-objects by substituting an empty dict.
        return value if isinstance(value, dict) else {}

    rows: list[dict[str, Any]] = []
    for item in _items(payload):
        meta = _as_dict(item.get("metadata"))
        node_name = meta.get("name")
        if not isinstance(node_name, str) or not node_name:
            continue
        node_spec = _as_dict(item.get("spec"))
        node_status = _as_dict(item.get("status"))
        info = _as_dict(node_status.get("nodeInfo"))
        node_labels = _as_dict(meta.get("labels"))
        raw_conditions = node_status.get("conditions")

        role_names = _node_roles(node_labels)
        pressure_flags = _node_pressure_conditions(raw_conditions)
        raw_created = meta.get("creationTimestamp")
        created = raw_created if isinstance(raw_created, str) else ""
        taint_list = _node_taints(node_spec.get("taints"))

        row: dict[str, Any] = {
            "name": node_name,
            "ready": _node_ready(raw_conditions),
            "roles": role_names,
            "is_worker": _node_is_worker(node_labels),
            "labels": _node_labels(node_labels),
            "hardware": _hardware_hint(node_labels, info),
            "arch": info.get("architecture") or "",
            "os": info.get("operatingSystem") or "",
            "kernel": info.get("kernelVersion") or "",
            "kubelet": info.get("kubeletVersion") or "",
            "container_runtime": info.get("containerRuntimeVersion") or "",
            "addresses": _node_addresses(node_status),
            "created_at": created,
            "age_hours": _age_hours(created),
            "taints": taint_list,
            "unschedulable": bool(node_spec.get("unschedulable")),
            "capacity": _node_capacity(node_status.get("capacity")),
            "allocatable": _node_capacity(node_status.get("allocatable")),
            "pressure": pressure_flags,
        }
        rows.append(row)
    return sorted(rows, key=lambda row: row.get("name") or "")
def _age_hours(timestamp: str) -> float | None:
if not timestamp:
return None
try:
parsed = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
except ValueError:
return None
return round((datetime.now(timezone.utc) - parsed).total_seconds() / 3600, 1)
def _node_age_stats(details: list[dict[str, Any]]) -> dict[str, Any]:
ages: list[tuple[str, float]] = []
for node in details:
name = node.get("name") if isinstance(node, dict) else ""
age = node.get("age_hours")
if isinstance(name, str) and name and isinstance(age, (int, float)):
ages.append((name, float(age)))
if not ages:
return {}
ages.sort(key=lambda item: item[1])
values = [age for _, age in ages]
return {
"min": round(min(values), 1),
"max": round(max(values), 1),
"avg": round(sum(values) / len(values), 1),
"youngest": [{"name": name, "age_hours": age} for name, age in ages[:5]],
"oldest": [{"name": name, "age_hours": age} for name, age in ages[-5:]],
}
def _node_flagged(details: list[dict[str, Any]], key: str) -> list[str]:
names: list[str] = []
for node in details:
name = node.get("name") if isinstance(node, dict) else ""
if not isinstance(name, str) or not name:
continue
if node.get(key):
names.append(name)
names.sort()
return names
def _node_taints(raw: Any) -> list[dict[str, str]]:
if not isinstance(raw, list):
return []
taints: list[dict[str, str]] = []
for entry in raw:
if not isinstance(entry, dict):
continue
key = entry.get("key")
effect = entry.get("effect")
value = entry.get("value")
if isinstance(key, str) and isinstance(effect, str):
taints.append(
{
"key": key,
"value": value if isinstance(value, str) else "",
"effect": effect,
}
)
return taints
def _summarize_inventory(details: list[dict[str, Any]]) -> dict[str, Any]:
    """Roll node detail rows up into a cluster-wide inventory summary.

    Per-node counters (totals, workers, hardware / arch / role breakdowns and
    pressure membership) are accumulated by ``_apply_node_summary``. This
    function then adds the sorted not-ready names, sorts each pressure list,
    and attaches age statistics plus the tainted and unschedulable node
    names.
    """
    summary: dict[str, Any] = dict(
        total=0,
        ready=0,
        workers={"total": 0, "ready": 0},
        by_hardware={},
        by_arch={},
        by_role={},
        not_ready_names=[],
        pressure_nodes={cond_type: [] for cond_type in _PRESSURE_TYPES},
        age_stats={},
        tainted_nodes=[],
        unschedulable_nodes=[],
    )
    unready: list[str] = []
    for row in details:
        counted_name = _apply_node_summary(summary, row)
        # An empty name means the row was rejected and must not be inspected.
        if not counted_name or row.get("ready"):
            continue
        unready.append(counted_name)
    summary["not_ready_names"] = sorted(unready)
    for affected in summary["pressure_nodes"].values():
        affected.sort()
    summary.update(
        age_stats=_node_age_stats(details),
        tainted_nodes=_node_flagged(details, "taints"),
        unschedulable_nodes=_node_flagged(details, "unschedulable"),
    )
    return summary
def _hardware_groups(details: list[dict[str, Any]]) -> list[dict[str, Any]]:
groups: dict[str, list[str]] = {}
for node in details:
if not isinstance(node, dict):
continue
name = node.get("name")
if not isinstance(name, str) or not name:
continue
hardware = str(node.get("hardware") or "unknown")
groups.setdefault(hardware, []).append(name)
output: list[dict[str, Any]] = []
for hardware, nodes in groups.items():
nodes.sort()
output.append({"hardware": hardware, "count": len(nodes), "nodes": nodes})
output.sort(key=lambda item: (-(item.get("count") or 0), item.get("hardware") or ""))
return output
def _pressure_summary(nodes_summary: dict[str, Any]) -> dict[str, Any]:
pressure_nodes = nodes_summary.get("pressure_nodes") if isinstance(nodes_summary, dict) else {}
summary: dict[str, Any] = {"by_type": {}, "total": 0}
if isinstance(pressure_nodes, dict):
for cond, names in pressure_nodes.items():
count = len(names) if isinstance(names, list) else 0
summary["by_type"][cond] = count
summary["total"] += count
unschedulable = nodes_summary.get("unschedulable_nodes") or []
summary["unschedulable"] = len(unschedulable) if isinstance(unschedulable, list) else 0
return summary
def _apply_node_summary(summary: dict[str, Any], node: dict[str, Any]) -> str:
name = node.get("name") if isinstance(node, dict) else ""
if not isinstance(name, str) or not name:
return ""
summary["total"] += 1
ready = bool(node.get("ready"))
if ready:
summary["ready"] += 1
if node.get("is_worker"):
summary["workers"]["total"] += 1
if ready:
summary["workers"]["ready"] += 1
hardware = node.get("hardware") or "unknown"
arch = node.get("arch") or "unknown"
summary["by_hardware"][hardware] = summary["by_hardware"].get(hardware, 0) + 1
summary["by_arch"][arch] = summary["by_arch"].get(arch, 0) + 1
for role in node.get("roles") or []:
summary["by_role"][role] = summary["by_role"].get(role, 0) + 1
_apply_pressure(summary, node, name)
return name
def _apply_pressure(summary: dict[str, Any], node: dict[str, Any], name: str) -> None:
pressure = node.get("pressure") or {}
if not isinstance(pressure, dict):
return
for cond_type, active in pressure.items():
if active and cond_type in summary["pressure_nodes"]:
summary["pressure_nodes"][cond_type].append(name)
def _node_capacity(raw: Any) -> dict[str, str]:
if not isinstance(raw, dict):
return {}
output: dict[str, str] = {}
for key in _CAPACITY_KEYS:
value = raw.get(key)
if isinstance(value, (str, int, float)) and value != "":
output[key] = str(value)
return output
def _node_pressure_conditions(conditions: Any) -> dict[str, bool]:
if not isinstance(conditions, list):
return {}
pressure: dict[str, bool] = {}
for condition in conditions:
if not isinstance(condition, dict):
continue
cond_type = condition.get("type")
if cond_type in _PRESSURE_TYPES:
pressure[cond_type] = condition.get("status") == "True"
return pressure
def _node_roles(labels: dict[str, Any]) -> list[str]:
roles: list[str] = []
for key in labels.keys():
if key.startswith("node-role.kubernetes.io/"):
role = key.split("/", 1)[-1]
if role:
roles.append(role)
return sorted(set(roles))
def _node_is_worker(labels: dict[str, Any]) -> bool:
if "node-role.kubernetes.io/control-plane" in labels:
return False
if "node-role.kubernetes.io/master" in labels:
return False
if "node-role.kubernetes.io/worker" in labels:
return True
return True
def _hardware_hint(labels: dict[str, Any], node_info: dict[str, Any]) -> str:
result = "unknown"
if str(labels.get("jetson") or "").lower() == "true":
result = "jetson"
else:
hardware = (labels.get("hardware") or "").strip().lower()
if hardware:
result = hardware
else:
kernel = str(node_info.get("kernelVersion") or "").lower()
os_image = str(node_info.get("osImage") or "").lower()
if "tegra" in kernel or "jetson" in os_image:
result = "jetson"
elif "raspi" in kernel or "bcm2711" in kernel:
result = "rpi"
else:
arch = str(node_info.get("architecture") or "").lower()
if arch == "amd64":
result = "amd64"
elif arch == "arm64":
result = "arm64-unknown"
return result
def _condition_status(conditions: Any, cond_type: str) -> tuple[bool | None, str, str]:
if not isinstance(conditions, list):
return None, "", ""
for condition in conditions:
if not isinstance(condition, dict):
continue
if condition.get("type") != cond_type:
continue
status = condition.get("status")
if status == "True":
return True, condition.get("reason") or "", condition.get("message") or ""
if status == "False":
return False, condition.get("reason") or "", condition.get("message") or ""
return None, condition.get("reason") or "", condition.get("message") or ""
return None, "", ""
# Export every module global whose name has a single leading underscore (dunder
# names excluded), plus ClusterStateSummary and SignalContext when present, so
# that `from <this module> import *` also re-exports the private helpers.
# globals() is read when this line runs, so names brought in by the star import
# at the top of the file are included as well.
# NOTE(review): keep this as a comprehension. A module-level `for` loop would
# bind its loop variable into globals() while iterating over it.
__all__ = [name for name in globals() if (name.startswith("_") and not name.startswith("__")) or name in {"ClusterStateSummary", "SignalContext"}]