"""Tests for the ariadne cluster-state relationship, anomaly, health, signal, profile, and attention helpers."""
from __future__ import annotations
|
|
|
|
from ariadne.services import cluster_state_anomalies as anomalies
|
|
from ariadne.services import cluster_state_attention as attention
|
|
from ariadne.services import cluster_state_health as health
|
|
from ariadne.services import cluster_state_profiles as profiles
|
|
from ariadne.services import cluster_state_relationships as relationships
|
|
from ariadne.services import cluster_state_signals as signals
|
|
from ariadne.services.cluster_state_contract import SignalContext
|
|
|
|
|
|
def test_relationship_context_and_cross_stats() -> None:
    """Run the relationship helpers over a single-node, single-namespace fixture."""
    # Inputs for the namespace context: pod counts, node placement, usage rows, baselines.
    pod_rows = [{"namespace": "apps", "pods_total": 4, "pods_running": 3, "primary_node": "titan-1"}]
    placement_rows = [{"namespace": "apps", "nodes": {"titan-1": 3}, "primary_node": "titan-1"}]
    usage_rows = [
        {
            "namespace": "apps",
            "cpu_usage": 2.0,
            "mem_usage": 4.0,
            "cpu_usage_ratio": 1.5,
            "mem_usage_ratio": 0.5,
        }
    ]
    namespace_baselines = {"apps": {"cpu": {"avg": 1.0}, "mem": {"avg": 2.0}}}
    ns_context = relationships._namespace_context(pod_rows, placement_rows, usage_rows, namespace_baselines)

    # cpu_usage 2.0 against a baseline average of 1.0 must be reported as a 100.0 delta.
    apps_entry = ns_context[0]
    assert apps_entry["baseline_delta"]["cpu"] == 100.0
    top_namespaces = relationships._namespace_nodes_top(ns_context, 1)
    assert top_namespaces[0]["namespace"] == "apps"

    workload_rows = [{"namespace": "apps", "workload": "api", "nodes": {"titan-1": 2}, "pods_total": 2}]
    workloads_by_node = relationships._node_workload_map(workload_rows)
    assert workloads_by_node["titan-1"] == {"apps/api": 2}
    top_workload_nodes = relationships._workload_nodes_top(workload_rows, 1)
    assert top_workload_nodes[0]["workload"] == "api"
    top_node_workloads = relationships._node_workloads_top(workloads_by_node)
    assert top_node_workloads[0]["node"] == "titan-1"
    indexed_workloads = relationships._workload_index(workload_rows)
    assert indexed_workloads[0]["workload"] == "api"

    node_details = [
        {
            "name": "titan-1",
            "ready": True,
            "hardware": "rpi5",
            "arch": "arm64",
            "roles": ["worker"],
        }
    ]
    node_load = [{"node": "titan-1", "cpu": 80.0, "ram": 40.0, "load_index": 0.9}]
    node_baselines = {"titan-1": {"cpu": {"avg": 40.0}, "ram": {"avg": 20.0}}}
    nodes_ctx = relationships._node_context(node_details, node_load, node_baselines, workloads_by_node)
    # cpu 80.0 against a baseline average of 40.0 must also be reported as a 100.0 delta.
    assert nodes_ctx[0]["baseline_delta"]["cpu"] == 100.0

    metric_snapshot = {
        "node_usage": {"cpu": [{"node": "titan-1", "value": 80.0}]},
        "namespace_top": {"cpu": [{"namespace": "apps", "value": 2.0}]},
        "pvc_usage_top": [{"metric": {"namespace": "apps", "persistentvolumeclaim": "data"}, "value": 91.0}],
    }
    node_metric_top = relationships._cross_node_metric_top(metric_snapshot, nodes_ctx)
    assert node_metric_top[0]["node"] == "titan-1"
    namespace_metric_top = relationships._cross_namespace_metric_top(metric_snapshot, ns_context)
    assert namespace_metric_top[0]["namespace"] == "apps"
    cross_stats = relationships._build_cross_stats(metric_snapshot, nodes_ctx, ns_context, workload_rows)
    assert cross_stats["node_metric_top"]

    lexicon = relationships._build_lexicon()
    assert lexicon["aliases"]["hot node"]
    cpu_deltas = relationships._delta_top(nodes_ctx, "cpu")
    assert cpu_deltas[0]["severity"] == "critical"
    reasons = relationships._reason_top({"OOMKilled": 2})
    assert reasons[0]["reason"] == "OOMKilled"
|
|
|
|
|
|
def test_relationship_edge_filters_and_baseline_helpers(monkeypatch) -> None:
    """Feed the relationship helpers blank, ``None`` and wrongly-typed inputs."""
    # Vector entries without the requested label are dropped entirely.
    unlabeled_vector = [None, {"metric": {}, "value": 1}]
    assert relationships._vector_to_named(unlabeled_vector, "node", "node") == []
    labeled_vector = [
        {"metric": {"node": "titan-1"}, "value": 2},
        {"metric": {"node": "titan-2"}, "value": 1},
    ]
    named_nodes = relationships._vector_to_named(labeled_vector, "node", "node")
    assert named_nodes[0]["node"] == "titan-1"

    pvc_vector = [
        {"metric": {}},
        {"metric": {"namespace": "apps", "persistentvolumeclaim": "data"}, "value": 90},
    ]
    expected_pvc_rows = [{"namespace": "apps", "pvc": "data", "used_percent": 90}]
    assert relationships._pvc_top(pvc_vector) == expected_pvc_rows

    # Namespace context built from partially malformed rows and a non-mapping baseline.
    pod_rows = [None, {"namespace": ""}, {"namespace": "apps", "pods_total": 1}]
    placement_rows = [{"namespace": "apps", "nodes": "bad"}]
    usage_rows = [{"namespace": "apps", "cpu_usage": "bad", "mem_usage": 2.0}]
    ns_context = relationships._namespace_context(pod_rows, placement_rows, usage_rows, "bad")
    apps_entry = ns_context[0]
    assert apps_entry["namespace"] == "apps"
    top_namespaces = relationships._namespace_nodes_top([None, ns_context[0]], 2)
    assert top_namespaces[0]["namespace"] == "apps"

    workload_rows = [
        None,
        {"namespace": "apps", "nodes": {"titan-1": 1}},
        {"namespace": "apps", "workload": "api", "nodes": "bad"},
        {"namespace": "apps", "workload": "api", "nodes": {"": 1, "titan-1": "2", "titan-2": "bad", "titan-3": 0}},
        {"workload": "solo", "nodes": {"titan-2": 1}},
    ]
    workloads_by_node = relationships._node_workload_map(workload_rows)
    assert workloads_by_node == {"titan-1": {"apps/api": 2}, "titan-2": {"solo": 1}}
    messy_node_map = {"": {}, "titan-1": "bad", "titan-2": {"solo": 1}}
    top_node_workloads = relationships._node_workloads_top(messy_node_map, limit_nodes=2)
    assert top_node_workloads[0]["node"] == "titan-2"
    indexed = relationships._workload_index([{"workload": "api", "pods_total": 1, "nodes": "bad"}])
    assert indexed[0]["nodes_top"] == []

    assert relationships._events_summary("bad") == {}
    warning_events = {
        "warnings_total": 2,
        "warnings_by_namespace": {"apps": 2, "db": 1},
        "warnings_recent": [1, 2],
    }
    summary = relationships._events_summary(warning_events)
    assert summary["top_namespace"] == {"namespace": "apps", "count": 2}

    raw_entries = [None, {"node": ""}, {"node": "n1", "value": "bad"}, {"node": "n2", "value": "2"}]
    expected_entries = [
        {"name": "n2", "value": 2.0},
        {"name": "n1", "value": 0.0},
    ]
    assert relationships._top_named_entries(raw_entries, "node", 3) == expected_entries

    # From here on the real helper is replaced by a stub that only yields a blank name.
    def _blank_named_entries(*_args):
        return [{"name": ""}]

    monkeypatch.setattr(relationships, "_top_named_entries", _blank_named_entries)
    assert relationships._cross_node_metric_top({"node_usage": {"cpu": [{}]}}, []) == []
    assert relationships._cross_namespace_metric_top({"namespace_top": {"cpu": [{}]}}, []) == []

    node_details = [None, {"name": ""}, {"name": "titan-1", "pressure": ["DiskPressure"]}]
    node_load = [{"node": "titan-1", "cpu": "bad", "load_index": 1.0}]
    nodes_ctx = relationships._node_context(node_details, node_load, "bad", "bad")
    assert nodes_ctx[0]["node"] == "titan-1"

    # Baseline helpers: unusable values or a zero average give no delta.
    assert relationships._baseline_delta("bad", {"avg": 1}) is None
    assert relationships._baseline_delta(1, {"avg": 0}) is None
    assert relationships._delta_severity(50) == "warning"
    assert relationships._delta_severity(5) == "info"
    assert relationships._delta_entry_label({"namespace": "apps"}) == ("namespace", "apps")
    delta_rows = [None, {"namespace": "apps", "baseline_delta": {"cpu": 5}}]
    top_deltas = relationships._delta_top(delta_rows, "cpu")
    assert top_deltas[0]["namespace"] == "apps"
    reason_counts = {"": 1, "OOMKilled": "bad", "BackOff": 2}
    assert relationships._reason_top(reason_counts) == [{"reason": "BackOff", "count": 2}]
|
|
|
|
|
|
def test_health_anomaly_signal_profile_and_attention_domains() -> None:
    """Drive the anomaly, health, signal, profile and attention builders from one degraded snapshot."""
    metric_snapshot = {
        "nodes_total": 2,
        "nodes_ready": 1,
        "pods_running": 8,
        "pods_pending": 2,
        "pods_failed": 1,
        "job_failures_24h": [{"value": 1, "metric": {"job_name": "job"}}],
        "pvc_usage_top": [{"metric": {"namespace": "apps", "persistentvolumeclaim": "data"}, "value": 92.0}],
        "top_restarts_1h": [{"metric": {"namespace": "apps", "pod": "api"}, "value": 3}],
    }
    nodes_summary = {"pressure_nodes": {"DiskPressure": ["titan-1"]}, "unschedulable_nodes": ["titan-2"]}
    workload_health = {
        "deployments": {"not_ready": 1, "items": [{"namespace": "apps", "name": "api", "desired": 2, "ready": 1}]},
        "statefulsets": {"not_ready": 0, "items": []},
        "daemonsets": {"not_ready": 0, "items": []},
    }
    pod_issue_counts = {
        "pending_over_15m": 2,
        "counts": {"Failed": 1},
        "waiting_reasons": {"CrashLoopBackOff": 3},
        "phase_reasons": {"Evicted": 1},
    }
    flux_state = {"not_ready": 1, "items": [{"name": "apps"}]}
    warning_events = {"warnings_total": 1, "warnings": [{"reason": "BackOff"}]}

    # --- anomalies ---
    anomaly_rows = anomalies._build_anomalies(
        metric_snapshot, nodes_summary, workload_health, flux_state, warning_events
    )
    reported_kinds = {row["kind"] for row in anomaly_rows}
    assert reported_kinds.issuperset({"pods_pending", "pvc_pressure", "flux_not_ready"})
    assert anomalies._severity_rank("critical") == 0
    assert anomalies._severity_rank("unknown") == 2
    pvc_signals = anomalies._pvc_pressure_signals(metric_snapshot)
    assert pvc_signals[0]["target"] == "apps/data"
    # A claim at 79.0 produces no pressure signal.
    cold_pvc_metrics = {
        "pvc_usage_top": [{"metric": {"namespace": "apps", "persistentvolumeclaim": "cold"}, "value": 79.0}]
    }
    assert anomalies._pvc_pressure_signals(cold_pvc_metrics) == []
    # The append helpers must leave the list untouched when given empty inputs.
    without_optional = []
    anomalies._append_node_anomalies(without_optional, {})
    anomalies._append_event_anomalies(without_optional, {})
    assert without_optional == []

    # --- health ---
    bullets = health._health_bullets(metric_snapshot, nodes_summary, workload_health, anomaly_rows)
    assert bullets[0] == "Nodes ready: 1/2"
    not_ready_items = health._workload_not_ready_items(workload_health)
    assert not_ready_items[0]["name"] == "api"
    restart_rows = health._pod_restarts_top(metric_snapshot)
    assert restart_rows[0]["pod"] == "api"

    # --- signals ---
    titan_node = {
        "node": "titan-1",
        "ready": True,
        "hardware": "rpi5",
        "arch": "arm64",
        "roles": ["worker"],
        "cpu": 90.0,
        "ram": 85.0,
        "disk": 95.0,
        "net": 50.0,
        "io": 60.0,
        "load_index": 0.95,
        "baseline": {"net": {"max": 10.0}, "io": {"max": 20.0}},
        "baseline_delta": {"cpu": 100.0},
        "pressure_flags": ["DiskPressure"],
    }
    nodes_ctx = [titan_node]
    apps_namespace = {
        "namespace": "apps",
        "pods_total": 4,
        "pods_running": 3,
        "primary_node": "titan-1",
        "nodes_top": [("titan-1", 4)],
        "cpu_usage": 2.0,
        "mem_usage": 4.0,
        "cpu_ratio": 1.5,
        "mem_ratio": 0.5,
        "baseline_delta": {"cpu": 100.0},
        "baseline": {"cpu": {"avg": 1.0}},
    }
    ns_context = [apps_namespace]
    signal_context = SignalContext(
        metric_snapshot, nodes_ctx, ns_context, workload_health, pod_issue_counts, flux_state
    )
    issue_summary = signals._pod_issue_summary(pod_issue_counts, metric_snapshot)
    assert issue_summary["waiting_reasons_top"][0]["reason"] == "CrashLoopBackOff"
    assert signals._build_signals(signal_context)

    # --- profiles ---
    pods_by_node = [{"node": "titan-1", "pods_total": 4, "pods_running": 3, "namespaces_top": [("apps", 4)]}]
    workloads_by_node = {"titan-1": {"apps/api": 2}}
    workload_rows = [
        {"namespace": "apps", "workload": "api", "pods_total": 2, "pods_running": 1, "nodes": {"titan-1": 2}}
    ]
    built_profiles = profiles._build_profiles(nodes_ctx, ns_context, pods_by_node, workload_rows, workloads_by_node)
    assert built_profiles["nodes"][0]["node"] == "titan-1"
    assert built_profiles["namespaces"][0]["namespace"] == "apps"
    assert built_profiles["workloads"][0]["workload"] == "api"

    # --- attention ---
    ranked = attention._build_attention_ranked(metric_snapshot, nodes_ctx, pod_issue_counts, workload_health)
    assert ranked[0]["score"] > 0
    node_score = attention._node_attention_score(nodes_ctx[0])
    assert node_score[0] > 0
|
|
|
|
|
|
def test_signal_helpers_filter_empty_and_malformed_inputs(monkeypatch) -> None:
    """Each signal helper yields an empty list for the blank or malformed inputs below."""
    node_rows = [None, {"node": ""}, {"node": "titan-1", "baseline_delta": {"cpu": 5.0}}]
    assert signals._node_delta_signals(node_rows) == []

    namespace_rows = [None, {"namespace": ""}, {"namespace": "apps", "baseline_delta": {"cpu": 5.0}}]
    assert signals._namespace_delta_signals(namespace_rows) == []

    assert signals._kustomization_signals({}) == []
    assert signals._pod_issue_signals("bad") == []

    # Stub the not-ready lookup so the workload helper sees nothing to report.
    def _no_not_ready_items(_health):
        return []

    monkeypatch.setattr(signals, "_workload_not_ready_items", _no_not_ready_items)
    all_ready = {"deployments": {"not_ready": 0}}
    assert signals._workload_health_signals(all_ready) == []
|
|
|
|
|
|
def test_profile_builders_filter_bad_nodes_and_workload_nodes() -> None:
    """Profile builders skip unusable rows and order what remains as asserted below."""
    node_rows = [
        None,
        {"node": ""},
        {"node": 123},
        {"node": "titan-2", "load_index": 1.0},
        {"node": "titan-1", "load_index": 2.0},
    ]
    pods_by_node = [{"node": "titan-1", "pods_total": 3, "pods_running": 2}]
    workloads_by_node = {"titan-1": {"apps/api": 2, "apps/worker": 1}}
    node_profiles = profiles._node_profiles(node_rows, pods_by_node, workloads_by_node)

    # Only the two well-formed nodes survive; titan-1 (load_index 2.0) comes first.
    profiled_names = [profile["node"] for profile in node_profiles]
    assert profiled_names == ["titan-1", "titan-2"]
    busiest, other = node_profiles[0], node_profiles[1]
    assert busiest["pods_total"] == 3
    assert busiest["workloads_top"] == [("apps/api", 2), ("apps/worker", 1)]
    assert other["namespaces_top"] == []

    workload_rows = [
        None,
        {"namespace": "apps", "workload": "api", "pods_total": 2, "nodes": {"titan-2": 1, "titan-1": 2}},
        {"namespace": "apps", "workload": "bad", "pods_total": 1, "nodes": "not-a-map"},
    ]
    workload_profiles = profiles._workload_profiles(workload_rows)

    assert workload_profiles[0]["nodes_top"] == [("titan-1", 2), ("titan-2", 1)]
    # A non-mapping "nodes" value results in an empty nodes_top.
    assert workload_profiles[1]["nodes_top"] == []
|
|
|
|
|
|
def test_attention_filters_edge_entries(monkeypatch) -> None:
    """Attention entry builders drop unusable rows and score a stubbed PVC entry."""
    candidate_nodes = [None, {"node": ""}, {"node": "quiet"}]
    assert attention._node_attention_entries(candidate_nodes) == []

    # Replace the PVC source with a stub that mixes a None row with one real entry.
    def _stub_pvc_entries(_metrics):
        return [None, {"namespace": "apps", "pvc": "data", "used_percent": 90.0}]

    monkeypatch.setattr(attention, "_pvc_pressure_entries", _stub_pvc_entries)

    expected_entry = {
        "kind": "pvc",
        "target": "apps/data",
        "score": 2.0,
        "reasons": ["usage 90.0%"],
    }
    assert attention._pvc_attention_entries({}) == [expected_entry]
|
|
|
|
|
|
def test_health_helpers_handle_clean_and_malformed_inputs() -> None:
    """Health helpers cope with an all-clear snapshot and with malformed item lists."""
    pod_counts = {"pods_running": 1, "pods_pending": 0, "pods_failed": 0}
    mixed_workload_health = {"deployments": {"not_ready": 0}, "statefulsets": "bad", "daemonsets": {}}
    blank_anomalies = [{"summary": ""}]
    bullets = health._health_bullets(pod_counts, {}, mixed_workload_health, blank_anomalies)
    expected_bullets = ["Pods: 1 running, 0 pending, 0 failed", "Workloads: all ready"]
    assert bullets == expected_bullets

    # None rows and a non-list "items" value are skipped; the deployment row is tagged with its kind.
    partial_health = {
        "deployments": {"items": [None, {"namespace": "apps", "name": "api", "desired": 2, "ready": 1}]},
        "statefulsets": {"items": "bad"},
    }
    workload_items = health._workload_not_ready_items(partial_health)
    expected_items = [{"kind": "deployment", "namespace": "apps", "name": "api", "desired": 2, "ready": 1}]
    assert workload_items == expected_items

    # Only the restart row that carries both namespace and pod labels is kept.
    restart_metrics = {
        "top_restarts_1h": [
            None,
            {"metric": {"namespace": "apps"}, "value": 3},
            {"metric": {"namespace": "apps", "pod": "api"}, "value": 2},
        ]
    }
    restarts = health._pod_restarts_top(restart_metrics)
    assert restarts == [{"namespace": "apps", "pod": "api", "value": 2}]
|