atlasbot: expand cluster snapshot events and storage
This commit is contained in:
parent
ef756ff1fa
commit
e809f0b8bd
@ -53,6 +53,8 @@ _PRESSURE_TYPES = {
|
|||||||
"PIDPressure",
|
"PIDPressure",
|
||||||
"NetworkUnavailable",
|
"NetworkUnavailable",
|
||||||
}
|
}
|
||||||
|
_EVENTS_MAX = 20
|
||||||
|
_EVENT_WARNING = "Warning"
|
||||||
_PHASE_SEVERITY = {
|
_PHASE_SEVERITY = {
|
||||||
"Failed": 3,
|
"Failed": 3,
|
||||||
"Pending": 2,
|
"Pending": 2,
|
||||||
@ -354,6 +356,67 @@ def _namespace_allowed(namespace: str) -> bool:
|
|||||||
return namespace not in _SYSTEM_NAMESPACES
|
return namespace not in _SYSTEM_NAMESPACES
|
||||||
|
|
||||||
|
|
||||||
|
def _event_timestamp(event: dict[str, Any]) -> str:
|
||||||
|
for key in ("eventTime", "lastTimestamp", "firstTimestamp"):
|
||||||
|
value = event.get(key)
|
||||||
|
if isinstance(value, str) and value:
|
||||||
|
return value
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _event_sort_key(timestamp: str) -> float:
|
||||||
|
if not timestamp:
|
||||||
|
return 0.0
|
||||||
|
try:
|
||||||
|
return datetime.fromisoformat(timestamp.replace("Z", "+00:00")).timestamp()
|
||||||
|
except ValueError:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def _summarize_events(payload: dict[str, Any]) -> dict[str, Any]:
    """Summarize Warning events from an event-list payload.

    Filters out system namespaces and non-Warning event types, then
    returns totals, per-reason and per-namespace counts, and the most
    recent warnings (capped at ``_EVENTS_MAX``, newest first).
    """

    def str_field(obj: dict[str, Any], key: str) -> str:
        # Defensive accessor: payload comes from the API server, so
        # every field is validated before use.
        raw = obj.get(key)
        return raw if isinstance(raw, str) else ""

    collected: list[dict[str, Any]] = []
    reason_totals: dict[str, int] = {}
    namespace_totals: dict[str, int] = {}

    for raw_event in _items(payload):
        meta = raw_event.get("metadata")
        if not isinstance(meta, dict):
            meta = {}
        ns = str_field(meta, "namespace")
        if not _namespace_allowed(ns):
            continue
        if str_field(raw_event, "type") != _EVENT_WARNING:
            continue

        reason = str_field(raw_event, "reason")
        message = str_field(raw_event, "message")
        raw_count = raw_event.get("count")
        # Events without a count field are treated as a single occurrence.
        occurrences = raw_count if isinstance(raw_count, int) else 1
        target = raw_event.get("involvedObject")
        if not isinstance(target, dict):
            target = {}
        seen_at = _event_timestamp(raw_event)

        collected.append(
            {
                "namespace": ns,
                "reason": reason,
                "message": message,
                "count": occurrences,
                "last_seen": seen_at,
                "object_kind": target.get("kind") or "",
                "object_name": target.get("name") or "",
            }
        )
        if reason:
            reason_totals[reason] = reason_totals.get(reason, 0) + occurrences
        if ns:
            namespace_totals[ns] = namespace_totals.get(ns, 0) + occurrences

    # Newest warnings first; missing timestamps sink to the end.
    collected.sort(key=lambda w: _event_sort_key(w.get("last_seen") or ""), reverse=True)
    return {
        "warnings_total": len(collected),
        "warnings_by_reason": reason_totals,
        "warnings_by_namespace": namespace_totals,
        "warnings_recent": collected[:_EVENTS_MAX],
    }
|
||||||
|
|
||||||
|
|
||||||
def _workload_from_labels(labels: dict[str, Any]) -> tuple[str, str]:
|
def _workload_from_labels(labels: dict[str, Any]) -> tuple[str, str]:
|
||||||
for key in _WORKLOAD_LABEL_KEYS:
|
for key in _WORKLOAD_LABEL_KEYS:
|
||||||
value = labels.get(key)
|
value = labels.get(key)
|
||||||
@ -770,11 +833,29 @@ def _node_usage(errors: list[str]) -> dict[str, Any]:
|
|||||||
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
|
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
|
||||||
"node",
|
"node",
|
||||||
)
|
)
|
||||||
|
usage["disk"] = _vm_node_metric(
|
||||||
|
'avg by (node) (((1 - avg by (instance) (node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} '
|
||||||
|
'/ node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})) * 100) * on(instance) group_left(node) '
|
||||||
|
'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))',
|
||||||
|
"node",
|
||||||
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
errors.append(f"node_usage: {exc}")
|
errors.append(f"node_usage: {exc}")
|
||||||
return usage
|
return usage
|
||||||
|
|
||||||
|
|
||||||
|
def _pvc_usage(errors: list[str]) -> list[dict[str, Any]]:
    """Return the top-5 PVCs by fill percentage, namespace-filtered.

    Best-effort: any query/parse failure is recorded in *errors* and an
    empty list is returned instead of raising.
    """
    query = (
        "topk(5, max by (namespace,persistentvolumeclaim) "
        "(kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100))"
    )
    try:
        return _filter_namespace_vector(_vm_vector(query))
    except Exception as exc:
        # Metrics backend may be unavailable; degrade gracefully.
        errors.append(f"pvc_usage: {exc}")
        return []
|
||||||
|
|
||||||
|
|
||||||
def _usage_stats(series: list[dict[str, Any]]) -> dict[str, float]:
|
def _usage_stats(series: list[dict[str, Any]]) -> dict[str, float]:
|
||||||
values: list[float] = []
|
values: list[float] = []
|
||||||
for entry in series:
|
for entry in series:
|
||||||
@ -823,6 +904,7 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
|||||||
"ram": _usage_stats(metrics.get("node_usage", {}).get("ram", [])),
|
"ram": _usage_stats(metrics.get("node_usage", {}).get("ram", [])),
|
||||||
"net": _usage_stats(metrics.get("node_usage", {}).get("net", [])),
|
"net": _usage_stats(metrics.get("node_usage", {}).get("net", [])),
|
||||||
"io": _usage_stats(metrics.get("node_usage", {}).get("io", [])),
|
"io": _usage_stats(metrics.get("node_usage", {}).get("io", [])),
|
||||||
|
"disk": _usage_stats(metrics.get("node_usage", {}).get("disk", [])),
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
metrics["namespace_cpu_top"] = _filter_namespace_vector(
|
metrics["namespace_cpu_top"] = _filter_namespace_vector(
|
||||||
@ -837,14 +919,17 @@ def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
|
|||||||
)
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
errors.append(f"namespace_usage: {exc}")
|
errors.append(f"namespace_usage: {exc}")
|
||||||
|
metrics["pvc_usage_top"] = _pvc_usage(errors)
|
||||||
metrics["units"] = {
|
metrics["units"] = {
|
||||||
"cpu": "percent",
|
"cpu": "percent",
|
||||||
"ram": "percent",
|
"ram": "percent",
|
||||||
"net": "bytes_per_sec",
|
"net": "bytes_per_sec",
|
||||||
"io": "bytes_per_sec",
|
"io": "bytes_per_sec",
|
||||||
|
"disk": "percent",
|
||||||
"restarts": "count",
|
"restarts": "count",
|
||||||
"namespace_cpu": "cores",
|
"namespace_cpu": "cores",
|
||||||
"namespace_mem": "bytes",
|
"namespace_mem": "bytes",
|
||||||
|
"pvc_used_percent": "percent",
|
||||||
"capacity_cpu": "cores",
|
"capacity_cpu": "cores",
|
||||||
"allocatable_cpu": "cores",
|
"allocatable_cpu": "cores",
|
||||||
"capacity_mem_bytes": "bytes",
|
"capacity_mem_bytes": "bytes",
|
||||||
@ -898,6 +983,13 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
|
|||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
errors.append(f"pods: {exc}")
|
errors.append(f"pods: {exc}")
|
||||||
|
|
||||||
|
events: dict[str, Any] = {}
|
||||||
|
try:
|
||||||
|
events_payload = get_json("/api/v1/events?limit=2000")
|
||||||
|
events = _summarize_events(events_payload)
|
||||||
|
except Exception as exc:
|
||||||
|
errors.append(f"events: {exc}")
|
||||||
|
|
||||||
metrics = _summarize_metrics(errors)
|
metrics = _summarize_metrics(errors)
|
||||||
|
|
||||||
snapshot = {
|
snapshot = {
|
||||||
@ -911,6 +1003,7 @@ def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
|
|||||||
"namespace_nodes": namespace_nodes,
|
"namespace_nodes": namespace_nodes,
|
||||||
"node_pods": node_pods,
|
"node_pods": node_pods,
|
||||||
"pod_issues": pod_issues,
|
"pod_issues": pod_issues,
|
||||||
|
"events": events,
|
||||||
"metrics": metrics,
|
"metrics": metrics,
|
||||||
"errors": errors,
|
"errors": errors,
|
||||||
}
|
}
|
||||||
|
|||||||
@ -53,6 +53,8 @@ def test_collect_cluster_state(monkeypatch) -> None:
|
|||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
if path.startswith("/api/v1/events"):
|
||||||
|
return {"items": []}
|
||||||
return {
|
return {
|
||||||
"items": [
|
"items": [
|
||||||
{
|
{
|
||||||
@ -86,9 +88,11 @@ def test_collect_cluster_state(monkeypatch) -> None:
|
|||||||
assert snapshot["namespace_nodes"]
|
assert snapshot["namespace_nodes"]
|
||||||
assert snapshot["node_pods"]
|
assert snapshot["node_pods"]
|
||||||
assert "pod_issues" in snapshot
|
assert "pod_issues" in snapshot
|
||||||
|
assert snapshot["events"]["warnings_total"] == 0
|
||||||
assert "node_usage_stats" in snapshot["metrics"]
|
assert "node_usage_stats" in snapshot["metrics"]
|
||||||
assert snapshot["metrics"]["namespace_cpu_top"] == []
|
assert snapshot["metrics"]["namespace_cpu_top"] == []
|
||||||
assert snapshot["metrics"]["namespace_mem_top"] == []
|
assert snapshot["metrics"]["namespace_mem_top"] == []
|
||||||
|
assert snapshot["metrics"]["pvc_usage_top"] == []
|
||||||
assert summary.nodes_total == 2
|
assert summary.nodes_total == 2
|
||||||
assert summary.nodes_ready == 1
|
assert summary.nodes_ready == 1
|
||||||
assert summary.pods_running == 5.0
|
assert summary.pods_running == 5.0
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user