from __future__ import annotations

import logging
import time
from typing import Any

import httpx

from atlasbot.config import Settings

log = logging.getLogger(__name__)

PVC_USAGE_CRITICAL = 90

_BYTES_KB = 1024
_BYTES_MB = 1024 * 1024
_BYTES_GB = 1024 * 1024 * 1024
_VALUE_PAIR_LEN = 2
class SnapshotProvider:
    """Fetch and cache the Ariadne snapshot used by the answer engine."""

    def __init__(self, settings: Settings) -> None:
        self._settings = settings
        # Last successfully fetched payload plus its monotonic timestamp.
        self._cache: dict[str, Any] = {}
        self._cache_ts = 0.0

    def _cache_valid(self) -> bool:
        # Clamp the TTL to at least 5s so a zero/negative snapshot_ttl_sec
        # cannot force a refresh on every call.
        age = time.monotonic() - self._cache_ts
        return age < max(5, self._settings.snapshot_ttl_sec)

    def get(self) -> dict[str, Any] | None:
        """Return the cached snapshot or refresh it from Ariadne.

        Serves the cached payload while it is fresh; otherwise fetches from
        the configured state URL. On any failure the stale cache (or
        ``None``) is returned instead of raising.
        """
        if self._cache and self._cache_valid():
            return self._cache
        if not self._settings.ariadne_state_url:
            return self._cache or None
        headers: dict[str, str] = {}
        token = self._settings.ariadne_state_token
        if token:
            headers["x-internal-token"] = token
        try:
            response = httpx.get(self._settings.ariadne_state_url, headers=headers, timeout=10.0)
            response.raise_for_status()
            payload = response.json()
            if isinstance(payload, dict):
                self._cache = payload
                self._cache_ts = time.monotonic()
                return payload
        except Exception as exc:  # network boundary: log, then serve stale data
            log.warning("snapshot fetch failed", extra={"extra": {"error": str(exc)}})
        return self._cache or None
def _node_usage_top(series: list[dict[str, Any]]) -> dict[str, Any] | None:
|
|
best = None
|
|
for entry in series or []:
|
|
if not isinstance(entry, dict):
|
|
continue
|
|
node = entry.get("node")
|
|
value = entry.get("value")
|
|
try:
|
|
numeric = float(value)
|
|
except (TypeError, ValueError):
|
|
continue
|
|
if best is None or numeric > best["value"]:
|
|
best = {"node": node, "value": numeric}
|
|
return best
|
|
|
|
|
|
def build_summary(snapshot: dict[str, Any] | None) -> dict[str, Any]:
    """Condense a raw snapshot into the summary shape used for prompts."""
    if not snapshot:
        return {}
    # Imported lazily, presumably to avoid circular imports between the
    # summary-builder modules — TODO confirm against the package layout.
    from .core_b import (
        _build_flux,
        _build_hottest,
        _build_namespace_capacity,
        _build_namespace_capacity_summary,
        _build_node_load_summary,
        _build_pvc,
        _build_workloads,
    )
    from .format_c import _build_cluster_watchlist

    nodes_detail = _nodes_detail(snapshot)
    metrics = _metrics(snapshot)
    summary: dict[str, Any] = {}

    # Pass a few raw sections through unchanged when they are well-typed.
    if isinstance(snapshot.get("nodes_summary"), dict):
        summary["nodes_summary"] = snapshot.get("nodes_summary")
    if metrics:
        summary["metrics"] = metrics
    if isinstance(snapshot.get("jobs"), dict):
        summary["jobs"] = snapshot.get("jobs")
    # Each builder returns either {} or a single-key mapping, so updates are
    # additive. Order matters for the builders that read earlier results:
    # _build_hardware_usage consumes "hardware_by_node" and
    # _build_cluster_watchlist consumes the summary built so far.
    summary.update(_build_nodes(snapshot))
    summary.update(_build_pressure(snapshot))
    summary.update(_build_hardware(nodes_detail))
    summary.update(_build_hardware_by_node(nodes_detail))
    summary.update(_build_hardware_usage(metrics, summary.get("hardware_by_node")))
    summary.update(_build_node_facts(nodes_detail))
    summary.update(_build_node_ages(nodes_detail))
    summary.update(_build_node_taints(nodes_detail))
    summary.update(_build_capacity(metrics))
    summary.update(_build_pods(metrics))
    summary.update(_build_namespace_pods(snapshot))
    summary.update(_build_namespace_nodes(snapshot))
    summary.update(_build_node_pods(snapshot))
    summary.update(_build_node_pods_top(metrics))
    summary.update(_build_pod_issues(snapshot))
    summary.update(_build_workload_health(snapshot))
    summary.update(_build_events(snapshot))
    summary.update(_build_event_summary(snapshot))
    summary.update(_build_postgres(metrics))
    summary.update(_build_hottest(metrics))
    summary.update(_build_pvc(metrics))
    summary.update(_build_namespace_capacity(metrics))
    summary.update(_build_namespace_capacity_summary(metrics))
    summary.update(_build_longhorn(snapshot))
    summary.update(_build_root_disk_headroom(metrics))
    summary.update(_build_node_load(metrics))
    summary.update(_build_node_load_summary(metrics))
    summary.update(_build_cluster_watchlist(summary))
    summary.update(_build_workloads(snapshot))
    summary.update(_build_flux(snapshot))
    # Finally fold in the snapshot's own pre-computed summary fields and
    # derive lexicon entries from the hardware classes found above.
    _merge_cluster_summary(snapshot, summary)
    _augment_lexicon(summary)
    return summary
def _merge_cluster_summary(snapshot: dict[str, Any], summary: dict[str, Any]) -> None:
    """Fold well-typed fields from the snapshot's own summary into *summary*."""
    raw = snapshot.get("summary")
    if not isinstance(raw, dict) or not raw:
        return
    expected_types: dict[str, type] = {
        "signals": list,
        "profiles": dict,
        "inventory": dict,
        "topology": dict,
        "lexicon": dict,
        "cross_stats": dict,
        "baseline_deltas": dict,
        "pod_issue_summary": dict,
        "trend_requests": dict,
        "pod_waiting_trends": dict,
        "pod_terminated_trends": dict,
    }
    _merge_cluster_fields(summary, raw, expected_types)
def _merge_cluster_fields(summary: dict[str, Any], cluster_summary: dict[str, Any], field_types: dict[str, type]) -> None:
|
|
for key, expected in field_types.items():
|
|
value = cluster_summary.get(key)
|
|
if isinstance(value, expected):
|
|
summary[key] = value
|
|
|
|
|
|
def _augment_lexicon(summary: dict[str, Any]) -> None:
|
|
lexicon = summary.get("lexicon")
|
|
if not isinstance(lexicon, dict):
|
|
lexicon = {"terms": [], "aliases": {}}
|
|
terms = list(lexicon.get("terms") or [])
|
|
aliases = dict(lexicon.get("aliases") or {})
|
|
hardware = summary.get("hardware") if isinstance(summary.get("hardware"), dict) else {}
|
|
hardware_map = {
|
|
"rpi5": "Raspberry Pi 5 nodes",
|
|
"rpi4": "Raspberry Pi 4 nodes",
|
|
"rpi": "Raspberry Pi nodes",
|
|
"jetson": "NVIDIA Jetson nodes",
|
|
"amd64": "AMD64 nodes",
|
|
}
|
|
existing_terms = {entry.get("term") for entry in terms if isinstance(entry, dict)}
|
|
for key, meaning in hardware_map.items():
|
|
if key not in hardware:
|
|
continue
|
|
if key not in existing_terms:
|
|
terms.append({"term": key, "meaning": meaning})
|
|
if key not in aliases:
|
|
aliases[key] = meaning
|
|
if "raspberry pi 5" not in aliases and "rpi5" in hardware:
|
|
aliases["raspberry pi 5"] = "rpi5"
|
|
if "raspberry pi 4" not in aliases and "rpi4" in hardware:
|
|
aliases["raspberry pi 4"] = "rpi4"
|
|
lexicon["terms"] = terms
|
|
lexicon["aliases"] = aliases
|
|
summary["lexicon"] = lexicon
|
|
|
|
|
|
def _nodes_detail(snapshot: dict[str, Any]) -> list[dict[str, Any]]:
|
|
items = snapshot.get("nodes_detail")
|
|
return items if isinstance(items, list) else []
|
|
|
|
|
|
def _metrics(snapshot: dict[str, Any]) -> dict[str, Any]:
|
|
metrics = snapshot.get("metrics")
|
|
return metrics if isinstance(metrics, dict) else {}
|
|
|
|
|
|
def _build_nodes(snapshot: dict[str, Any]) -> dict[str, Any]:
|
|
nodes_summary = snapshot.get("nodes_summary") if isinstance(snapshot.get("nodes_summary"), dict) else {}
|
|
if not nodes_summary:
|
|
return {}
|
|
return {
|
|
"nodes": {
|
|
"total": nodes_summary.get("total"),
|
|
"ready": nodes_summary.get("ready"),
|
|
"not_ready": nodes_summary.get("not_ready"),
|
|
}
|
|
}
|
|
|
|
|
|
def _build_pressure(snapshot: dict[str, Any]) -> dict[str, Any]:
|
|
nodes_summary = snapshot.get("nodes_summary") if isinstance(snapshot.get("nodes_summary"), dict) else {}
|
|
pressure = nodes_summary.get("pressure_nodes") if isinstance(nodes_summary.get("pressure_nodes"), dict) else {}
|
|
if not pressure:
|
|
return {}
|
|
return {"pressure_nodes": pressure}
|
|
|
|
|
|
def _build_hardware(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
|
|
hardware: dict[str, list[str]] = {}
|
|
for node in nodes_detail or []:
|
|
if not isinstance(node, dict):
|
|
continue
|
|
name = node.get("name")
|
|
hardware_class = node.get("hardware") or "unknown"
|
|
if name:
|
|
hardware.setdefault(hardware_class, []).append(name)
|
|
if not hardware:
|
|
return {}
|
|
return {"hardware": {key: sorted(value) for key, value in hardware.items()}}
|
|
|
|
|
|
def _build_hardware_by_node(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
|
|
mapping: dict[str, str] = {}
|
|
for node in nodes_detail or []:
|
|
if not isinstance(node, dict):
|
|
continue
|
|
name = node.get("name")
|
|
if isinstance(name, str) and name:
|
|
hardware = node.get("hardware") or "unknown"
|
|
mapping[name] = str(hardware)
|
|
return {"hardware_by_node": mapping} if mapping else {}
|
|
|
|
|
|
def _build_hardware_usage(metrics: dict[str, Any], hardware_by_node: dict[str, Any] | None) -> dict[str, Any]: # noqa: C901
|
|
if not isinstance(hardware_by_node, dict) or not hardware_by_node:
|
|
return {}
|
|
node_load = metrics.get("node_load") if isinstance(metrics.get("node_load"), list) else []
|
|
if not node_load:
|
|
return {}
|
|
buckets: dict[str, dict[str, list[float]]] = {}
|
|
for entry in node_load:
|
|
if not isinstance(entry, dict):
|
|
continue
|
|
node = entry.get("node")
|
|
if not isinstance(node, str) or not node:
|
|
continue
|
|
hardware = hardware_by_node.get(node, "unknown")
|
|
bucket = buckets.setdefault(str(hardware), {"load_index": [], "cpu": [], "ram": [], "net": [], "io": []})
|
|
for key in ("load_index", "cpu", "ram", "net", "io"):
|
|
value = entry.get(key)
|
|
if isinstance(value, (int, float)):
|
|
bucket[key].append(float(value))
|
|
output: list[dict[str, Any]] = []
|
|
for hardware, metrics_bucket in buckets.items():
|
|
row: dict[str, Any] = {"hardware": hardware}
|
|
for key, values in metrics_bucket.items():
|
|
if values:
|
|
row[key] = sum(values) / len(values)
|
|
output.append(row)
|
|
output.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("hardware") or ""))
|
|
return {"hardware_usage_avg": output}
|
|
|
|
|
|
def _build_node_ages(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
|
|
ages: list[dict[str, Any]] = []
|
|
for node in nodes_detail or []:
|
|
if not isinstance(node, dict):
|
|
continue
|
|
name = node.get("name")
|
|
age = node.get("age_hours")
|
|
if name and isinstance(age, (int, float)):
|
|
ages.append({"name": name, "age_hours": age})
|
|
ages.sort(key=lambda item: -(item.get("age_hours") or 0))
|
|
return {"node_ages": ages[:5]} if ages else {}
|
|
|
|
|
|
def _count_values(nodes_detail: list[dict[str, Any]], key: str) -> dict[str, int]:
|
|
counts: dict[str, int] = {}
|
|
for node in nodes_detail or []:
|
|
if not isinstance(node, dict):
|
|
continue
|
|
value = node.get(key)
|
|
if isinstance(value, str) and value:
|
|
counts[value] = counts.get(value, 0) + 1
|
|
return counts
|
|
|
|
|
|
def _build_node_facts(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
    """Summarise per-node platform facts (arch, OS, versions, roles) as counts."""
    if not nodes_detail:
        return {}
    role_counts: dict[str, int] = {}

    def bump(role: str) -> None:
        # Tally one occurrence of a role label.
        role_counts[role] = role_counts.get(role, 0) + 1

    for node in nodes_detail:
        if not isinstance(node, dict):
            continue
        if node.get("is_worker"):
            bump("worker")
        roles = node.get("roles")
        if isinstance(roles, list):
            for role in roles:
                if isinstance(role, str) and role:
                    bump(role)
    return {
        "node_arch_counts": _count_values(nodes_detail, "arch"),
        "node_os_counts": _count_values(nodes_detail, "os"),
        "node_kubelet_versions": _count_values(nodes_detail, "kubelet"),
        "node_kernel_versions": _count_values(nodes_detail, "kernel"),
        "node_runtime_versions": _count_values(nodes_detail, "container_runtime"),
        "node_role_counts": role_counts,
    }
def _build_node_taints(nodes_detail: list[dict[str, Any]]) -> dict[str, Any]:
|
|
taints: dict[str, list[str]] = {}
|
|
for node in nodes_detail or []:
|
|
if not isinstance(node, dict):
|
|
continue
|
|
name = node.get("name")
|
|
if not name:
|
|
continue
|
|
entries = node.get("taints") if isinstance(node.get("taints"), list) else []
|
|
for entry in entries:
|
|
if not isinstance(entry, dict):
|
|
continue
|
|
key = entry.get("key")
|
|
effect = entry.get("effect")
|
|
if isinstance(key, str) and isinstance(effect, str):
|
|
label = f"{key}:{effect}"
|
|
taints.setdefault(label, []).append(name)
|
|
if not taints:
|
|
return {}
|
|
return {"node_taints": {key: sorted(names) for key, names in taints.items()}}
|
|
|
|
|
|
def _build_root_disk_headroom(metrics: dict[str, Any]) -> dict[str, Any]:
|
|
node_usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {}
|
|
disk = node_usage.get("disk") if isinstance(node_usage.get("disk"), list) else []
|
|
if not disk:
|
|
return {}
|
|
entries = []
|
|
for entry in disk:
|
|
if not isinstance(entry, dict):
|
|
continue
|
|
node = entry.get("node")
|
|
try:
|
|
used_pct = float(entry.get("value"))
|
|
except (TypeError, ValueError):
|
|
continue
|
|
headroom = max(0.0, 100.0 - used_pct)
|
|
if node:
|
|
entries.append({"node": node, "headroom_pct": headroom, "used_pct": used_pct})
|
|
entries.sort(key=lambda item: (item.get("headroom_pct") or 0.0, item.get("node") or ""))
|
|
return {"root_disk_low_headroom": entries[:5]} if entries else {}
|
|
|
|
|
|
def _build_longhorn(snapshot: dict[str, Any]) -> dict[str, Any]:
|
|
longhorn = snapshot.get("longhorn")
|
|
return {"longhorn": longhorn} if isinstance(longhorn, dict) and longhorn else {}
|
|
|
|
|
|
def _build_node_load(metrics: dict[str, Any]) -> dict[str, Any]:
|
|
node_load = metrics.get("node_load")
|
|
if not isinstance(node_load, list) or not node_load:
|
|
return {}
|
|
return {"node_load": node_load}
|
|
|
|
|
|
def _build_pods(metrics: dict[str, Any]) -> dict[str, Any]:
|
|
pods = {
|
|
"running": metrics.get("pods_running"),
|
|
"pending": metrics.get("pods_pending"),
|
|
"failed": metrics.get("pods_failed"),
|
|
"succeeded": metrics.get("pods_succeeded"),
|
|
}
|
|
if not any(value is not None for value in pods.values()):
|
|
return {}
|
|
return {"pods": pods}
|
|
|
|
|
|
def _build_capacity(metrics: dict[str, Any]) -> dict[str, Any]:
|
|
if not metrics:
|
|
return {}
|
|
capacity = {
|
|
"cpu": metrics.get("capacity_cpu"),
|
|
"allocatable_cpu": metrics.get("allocatable_cpu"),
|
|
"mem_bytes": metrics.get("capacity_mem_bytes"),
|
|
"allocatable_mem_bytes": metrics.get("allocatable_mem_bytes"),
|
|
"pods": metrics.get("capacity_pods"),
|
|
"allocatable_pods": metrics.get("allocatable_pods"),
|
|
}
|
|
if not any(value is not None for value in capacity.values()):
|
|
return {}
|
|
return {"capacity": capacity}
|
|
|
|
|
|
def _build_namespace_pods(snapshot: dict[str, Any]) -> dict[str, Any]:
|
|
namespaces = snapshot.get("namespace_pods")
|
|
if not isinstance(namespaces, list) or not namespaces:
|
|
return {}
|
|
return {"namespace_pods": namespaces}
|
|
|
|
|
|
def _build_namespace_nodes(snapshot: dict[str, Any]) -> dict[str, Any]:
|
|
namespace_nodes = snapshot.get("namespace_nodes")
|
|
if not isinstance(namespace_nodes, list) or not namespace_nodes:
|
|
return {}
|
|
return {"namespace_nodes": namespace_nodes}
|
|
|
|
|
|
def _build_node_pods(snapshot: dict[str, Any]) -> dict[str, Any]:
|
|
node_pods = snapshot.get("node_pods")
|
|
if not isinstance(node_pods, list) or not node_pods:
|
|
return {}
|
|
return {"node_pods": node_pods}
|
|
|
|
|
|
def _build_node_pods_top(metrics: dict[str, Any]) -> dict[str, Any]:
|
|
top = metrics.get("node_pods_top")
|
|
if not isinstance(top, list) or not top:
|
|
return {}
|
|
return {"node_pods_top": top}
|
|
|
|
|
|
def _build_pod_issues(snapshot: dict[str, Any]) -> dict[str, Any]:
|
|
pod_issues = snapshot.get("pod_issues")
|
|
if not isinstance(pod_issues, dict) or not pod_issues:
|
|
return {}
|
|
return {"pod_issues": pod_issues}
|
|
|
|
|
|
def _build_workload_health(snapshot: dict[str, Any]) -> dict[str, Any]:
|
|
health = snapshot.get("workloads_health")
|
|
if not isinstance(health, dict) or not health:
|
|
return {}
|
|
deployments = health.get("deployments")
|
|
statefulsets = health.get("statefulsets")
|
|
daemonsets = health.get("daemonsets")
|
|
if not isinstance(deployments, dict) or not isinstance(statefulsets, dict) or not isinstance(daemonsets, dict):
|
|
return {}
|
|
return {
|
|
"workloads_health": {
|
|
"deployments": deployments,
|
|
"statefulsets": statefulsets,
|
|
"daemonsets": daemonsets,
|
|
}
|
|
}
|
|
|
|
|
|
def _build_events(snapshot: dict[str, Any]) -> dict[str, Any]:
|
|
events = snapshot.get("events")
|
|
if not isinstance(events, dict) or not events:
|
|
return {}
|
|
return {"events": events}
|
|
|
|
|
|
def _build_event_summary(snapshot: dict[str, Any]) -> dict[str, Any]:
|
|
events = snapshot.get("events")
|
|
if not isinstance(events, dict) or not events:
|
|
return {}
|
|
summary = {}
|
|
if isinstance(events.get("warnings_top_reason"), dict):
|
|
summary["warnings_top_reason"] = events.get("warnings_top_reason")
|
|
if events.get("warnings_latest"):
|
|
summary["warnings_latest"] = events.get("warnings_latest")
|
|
return {"event_summary": summary} if summary else {}
|
|
|
|
|
|
def _build_postgres(metrics: dict[str, Any]) -> dict[str, Any]:
|
|
postgres = metrics.get("postgres_connections") if isinstance(metrics.get("postgres_connections"), dict) else {}
|
|
if not postgres:
|
|
return {}
|
|
return {
|
|
"postgres": {
|
|
"used": postgres.get("used"),
|
|
"max": postgres.get("max"),
|
|
"hottest_db": postgres.get("hottest_db"),
|
|
"by_db": postgres.get("by_db"),
|
|
}
|
|
}
|