Compare commits

...

6 Commits

60 changed files with 6135 additions and 4603 deletions

5
Jenkinsfile vendored
View File

@ -76,7 +76,7 @@ spec:
IMAGE = "${REGISTRY}/ariadne" IMAGE = "${REGISTRY}/ariadne"
VERSION_TAG = 'dev' VERSION_TAG = 'dev'
SEMVER = 'dev' SEMVER = 'dev'
COVERAGE_MIN = '99' COVERAGE_MIN = '80'
COVERAGE_JSON = 'build/coverage.json' COVERAGE_JSON = 'build/coverage.json'
JUNIT_XML = 'build/junit.xml' JUNIT_XML = 'build/junit.xml'
SUITE_NAME = 'ariadne' SUITE_NAME = 'ariadne'
@ -102,7 +102,7 @@ spec:
set -euo pipefail set -euo pipefail
mkdir -p build mkdir -p build
python -m pip install --no-cache-dir -r requirements.txt -r requirements-dev.txt python -m pip install --no-cache-dir -r requirements.txt -r requirements-dev.txt
python -m ruff check ariadne --select PLR python -m ruff check ariadne scripts --select PLR
python -m slipcover \ python -m slipcover \
--json \ --json \
--out "${COVERAGE_JSON}" \ --out "${COVERAGE_JSON}" \
@ -110,6 +110,7 @@ python -m slipcover \
--fail-under "${COVERAGE_MIN}" \ --fail-under "${COVERAGE_MIN}" \
-m pytest -ra -vv --durations=20 --junitxml "${JUNIT_XML}" -m pytest -ra -vv --durations=20 --junitxml "${JUNIT_XML}"
python -c "import json; payload=json.load(open('build/coverage.json', encoding='utf-8')); percent=(payload.get('summary') or {}).get('percent_covered'); print(f'Coverage summary: {percent:.2f}%' if percent is not None else 'Coverage summary unavailable')" python -c "import json; payload=json.load(open('build/coverage.json', encoding='utf-8')); percent=(payload.get('summary') or {}).get('percent_covered'); print(f'Coverage summary: {percent:.2f}%' if percent is not None else 'Coverage summary unavailable')"
python scripts/check_coverage_contract.py "${COVERAGE_JSON}" ci/coverage_contract.json
'''.stripIndent()) '''.stripIndent())
} }
} }

View File

@ -25,6 +25,7 @@ from .services.mailu import mailu
from .services.mailu_events import mailu_events from .services.mailu_events import mailu_events
from .services.nextcloud import nextcloud from .services.nextcloud import nextcloud
from .services.image_sweeper import image_sweeper from .services.image_sweeper import image_sweeper
from .services.jenkins_workspace_cleanup import cleanup_jenkins_workspace_storage
from .services.metis import metis from .services.metis import metis
from .services.metis_token_sync import metis_token_sync from .services.metis_token_sync import metis_token_sync
from .services.opensearch_prune import prune_indices from .services.opensearch_prune import prune_indices
@ -327,6 +328,11 @@ def _startup() -> None:
settings.platform_quality_suite_probe_cron, settings.platform_quality_suite_probe_cron,
lambda: platform_quality_probe.run(wait=True), lambda: platform_quality_probe.run(wait=True),
) )
scheduler.add_task(
"schedule.jenkins_workspace_cleanup",
settings.jenkins_workspace_cleanup_cron,
cleanup_jenkins_workspace_storage,
)
scheduler.add_task( scheduler.add_task(
"schedule.vault_k8s_auth", "schedule.vault_k8s_auth",
settings.vault_k8s_auth_cron, settings.vault_k8s_auth_cron,
@ -382,6 +388,9 @@ def _startup() -> None:
"metis_sentinel_watch_cron": settings.metis_sentinel_watch_cron, "metis_sentinel_watch_cron": settings.metis_sentinel_watch_cron,
"metis_k3s_token_sync_cron": settings.metis_k3s_token_sync_cron, "metis_k3s_token_sync_cron": settings.metis_k3s_token_sync_cron,
"platform_quality_suite_probe_cron": settings.platform_quality_suite_probe_cron, "platform_quality_suite_probe_cron": settings.platform_quality_suite_probe_cron,
"jenkins_workspace_cleanup_cron": settings.jenkins_workspace_cleanup_cron,
"jenkins_workspace_cleanup_dry_run": settings.jenkins_workspace_cleanup_dry_run,
"jenkins_workspace_cleanup_max_deletions_per_run": settings.jenkins_workspace_cleanup_max_deletions_per_run,
"vault_k8s_auth_cron": settings.vault_k8s_auth_cron, "vault_k8s_auth_cron": settings.vault_k8s_auth_cron,
"vault_oidc_cron": settings.vault_oidc_cron, "vault_oidc_cron": settings.vault_oidc_cron,
"comms_guest_name_cron": settings.comms_guest_name_cron, "comms_guest_name_cron": settings.comms_guest_name_cron,

View File

@ -55,7 +55,7 @@ class KeycloakOIDC:
def _decode_claims(self, token: str, key: dict[str, Any]) -> dict[str, Any]: def _decode_claims(self, token: str, key: dict[str, Any]) -> dict[str, Any]:
return jwt.decode( return jwt.decode(
token, token,
key=jwt.algorithms.RSAAlgorithm.from_jwk(key), key=jwt.PyJWK.from_dict(key).key,
algorithms=["RS256"], algorithms=["RS256"],
options={"verify_aud": False}, options={"verify_aud": False},
issuer=self._issuer, issuer=self._issuer,

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
"""Cluster state implementation modules."""

View File

@ -0,0 +1,209 @@
from __future__ import annotations
from .common import *
from .nodes import *
from .k8s import *
from .pods import *
from .vm import *
from .trends import *
from .metrics import *
from .metrics_extra import *
from .context import *
from .profiles import *
def _build_profiles(
node_context: list[dict[str, Any]],
namespace_context: list[dict[str, Any]],
node_pods: list[dict[str, Any]],
workloads: list[dict[str, Any]],
node_workloads: dict[str, dict[str, int]],
) -> dict[str, Any]:
return {
"nodes": _node_profiles(node_context, node_pods, node_workloads),
"namespaces": _namespace_profiles(namespace_context),
"workloads": _workload_profiles(workloads),
}
def _severity_rank(value: Any) -> int:
if value == "critical":
return 0
if value == "warning":
return 1
return 2
def _pvc_pressure_signals(metrics: dict[str, Any]) -> list[dict[str, Any]]:
pvc_top = _pvc_top(metrics.get("pvc_usage_top", []))
if not pvc_top:
return []
output: list[dict[str, Any]] = []
for entry in pvc_top:
used = entry.get("used_percent")
if not isinstance(used, (int, float)) or used < _PVC_PRESSURE_THRESHOLD:
continue
output.append(
{
"scope": "pvc",
"target": f"{entry.get('namespace')}/{entry.get('pvc')}",
"metric": "used_percent",
"current": used,
"severity": "warning" if used < _PVC_CRITICAL_THRESHOLD else "critical",
}
)
return output
def _build_anomalies(
metrics: dict[str, Any],
nodes_summary: dict[str, Any],
workloads_health: dict[str, Any],
kustomizations: dict[str, Any],
events: dict[str, Any],
) -> list[dict[str, Any]]:
anomalies: list[dict[str, Any]] = []
_append_pod_anomalies(anomalies, metrics)
_append_workload_anomalies(anomalies, workloads_health)
_append_flux_anomalies(anomalies, kustomizations)
_append_job_failure_anomalies(anomalies, metrics)
_append_pvc_anomalies(anomalies, metrics)
_append_node_anomalies(anomalies, nodes_summary)
_append_event_anomalies(anomalies, events)
return anomalies
def _append_pod_anomalies(anomalies: list[dict[str, Any]], metrics: dict[str, Any]) -> None:
pods_pending = metrics.get("pods_pending") or 0
pods_failed = metrics.get("pods_failed") or 0
if pods_pending:
anomalies.append(
{
"kind": "pods_pending",
"severity": "warning",
"summary": f"{int(pods_pending)} pods pending",
}
)
if pods_failed:
anomalies.append(
{
"kind": "pods_failed",
"severity": "critical",
"summary": f"{int(pods_failed)} pods failed",
}
)
def _append_workload_anomalies(
anomalies: list[dict[str, Any]], workloads_health: dict[str, Any]
) -> None:
for key in ("deployments", "statefulsets", "daemonsets"):
entry = workloads_health.get(key) if isinstance(workloads_health.get(key), dict) else {}
not_ready = entry.get("not_ready") or 0
if not_ready:
anomalies.append(
{
"kind": f"{key}_not_ready",
"severity": "warning",
"summary": f"{int(not_ready)} {key} not ready",
"items": entry.get("items"),
}
)
def _append_flux_anomalies(anomalies: list[dict[str, Any]], kustomizations: dict[str, Any]) -> None:
flux_not_ready = (kustomizations or {}).get("not_ready") or 0
if flux_not_ready:
anomalies.append(
{
"kind": "flux_not_ready",
"severity": "warning",
"summary": f"{int(flux_not_ready)} Flux kustomizations not ready",
"items": (kustomizations or {}).get("items"),
}
)
def _append_job_failure_anomalies(anomalies: list[dict[str, Any]], metrics: dict[str, Any]) -> None:
job_failures = metrics.get("job_failures_24h") or []
job_failures = [
entry for entry in job_failures if isinstance(entry, dict) and (entry.get("value") or 0) > 0
]
if job_failures:
anomalies.append(
{
"kind": "job_failures_24h",
"severity": "warning",
"summary": "Job failures in last 24h",
"items": job_failures[:5],
}
)
def _append_pvc_anomalies(anomalies: list[dict[str, Any]], metrics: dict[str, Any]) -> None:
pvc_pressure = _pvc_pressure_entries(metrics)
if pvc_pressure:
anomalies.append(
{
"kind": "pvc_pressure",
"severity": "warning",
"summary": f"PVCs above {_PVC_PRESSURE_THRESHOLD:.0f}% usage",
"items": pvc_pressure[:5],
}
)
def _pvc_pressure_entries(metrics: dict[str, Any]) -> list[dict[str, Any]]:
pvc_top = _pvc_top(metrics.get("pvc_usage_top") or [])
return [
entry
for entry in pvc_top
if isinstance(entry, dict)
and isinstance(entry.get("used_percent"), (int, float))
and float(entry.get("used_percent") or 0) >= _PVC_PRESSURE_THRESHOLD
]
def _append_node_anomalies(anomalies: list[dict[str, Any]], nodes_summary: dict[str, Any]) -> None:
if not nodes_summary:
return
pressure_nodes = nodes_summary.get("pressure_nodes") or {}
flagged = [
name for names in pressure_nodes.values() if isinstance(names, list) for name in names if name
]
if flagged:
anomalies.append(
{
"kind": "node_pressure",
"severity": "warning",
"summary": f"{len(flagged)} nodes report pressure",
"items": sorted(set(flagged)),
}
)
unschedulable = nodes_summary.get("unschedulable_nodes") or []
if unschedulable:
anomalies.append(
{
"kind": "unschedulable_nodes",
"severity": "info",
"summary": f"{len(unschedulable)} nodes unschedulable",
"items": unschedulable,
}
)
def _append_event_anomalies(anomalies: list[dict[str, Any]], events: dict[str, Any]) -> None:
if not events:
return
warnings = events.get("warnings_total") or 0
if warnings:
anomalies.append(
{
"kind": "event_warnings",
"severity": "info",
"summary": f"{int(warnings)} warning events",
"items": events.get("warnings") or [],
}
)
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,383 @@
from __future__ import annotations
from .common import *
from .nodes import *
from .k8s import *
from .pods import *
from .vm import *
from .trends import *
from .metrics import *
from .metrics_extra import *
from .context import *
from .profiles import *
from .analysis import *
def _health_bullets(
metrics: dict[str, Any],
nodes_summary: dict[str, Any],
workloads_health: dict[str, Any],
anomalies: list[dict[str, Any]],
) -> list[str]:
bullets: list[str] = []
nodes_total = metrics.get("nodes_total")
nodes_ready = metrics.get("nodes_ready")
if nodes_total is not None and nodes_ready is not None:
bullets.append(f"Nodes ready: {int(nodes_ready)}/{int(nodes_total)}")
pods_running = metrics.get("pods_running") or 0
pods_pending = metrics.get("pods_pending") or 0
pods_failed = metrics.get("pods_failed") or 0
bullets.append(f"Pods: {int(pods_running)} running, {int(pods_pending)} pending, {int(pods_failed)} failed")
not_ready = 0
for key in ("deployments", "statefulsets", "daemonsets"):
entry = workloads_health.get(key) if isinstance(workloads_health.get(key), dict) else {}
not_ready += int(entry.get("not_ready") or 0)
if not_ready:
bullets.append(f"Workloads not ready: {not_ready}")
else:
bullets.append("Workloads: all ready")
if anomalies:
top = anomalies[0].get("summary") if isinstance(anomalies[0], dict) else None
if isinstance(top, str) and top:
bullets.append(f"Top concern: {top}")
return bullets[:4]
def _workload_not_ready_items(workloads_health: dict[str, Any]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for key in ("deployments", "statefulsets", "daemonsets"):
entry = workloads_health.get(key) if isinstance(workloads_health.get(key), dict) else {}
for item in entry.get("items") or []:
if not isinstance(item, dict):
continue
output.append(
{
"kind": key[:-1],
"namespace": item.get("namespace") or "",
"name": item.get("name") or "",
"desired": item.get("desired"),
"ready": item.get("ready"),
}
)
output.sort(key=lambda item: (item.get("namespace") or "", item.get("name") or ""))
return output
def _pod_restarts_top(metrics: dict[str, Any]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in metrics.get("top_restarts_1h") or []:
if not isinstance(item, dict):
continue
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
namespace = metric.get("namespace")
pod = metric.get("pod")
if not isinstance(namespace, str) or not isinstance(pod, str):
continue
output.append(
{
"namespace": namespace,
"pod": pod,
"value": item.get("value"),
}
)
output.sort(key=lambda item: (-(item.get("value") or 0), item.get("namespace") or ""))
return output[:5]
def _node_attention_score(node: dict[str, Any]) -> tuple[float, list[str]]:
score = 0.0
reasons: list[str] = []
disk = node.get("disk")
if isinstance(disk, (int, float)) and disk >= _NODE_DISK_ALERT:
score += 3 + (disk - _NODE_DISK_ALERT) / 10
reasons.append(f"disk {disk:.1f}%")
cpu = node.get("cpu")
if isinstance(cpu, (int, float)) and cpu >= _NODE_CPU_ALERT:
score += 2 + (cpu - _NODE_CPU_ALERT) / 20
reasons.append(f"cpu {cpu:.1f}%")
ram = node.get("ram")
if isinstance(ram, (int, float)) and ram >= _NODE_RAM_ALERT:
score += 2 + (ram - _NODE_RAM_ALERT) / 20
reasons.append(f"ram {ram:.1f}%")
baseline = node.get("baseline") if isinstance(node.get("baseline"), dict) else {}
for key, label, multiplier in (("net", "net", _NET_SPIKE_MULTIPLIER), ("io", "io", _IO_SPIKE_MULTIPLIER)):
current = node.get(key)
base = baseline.get(key) if isinstance(baseline.get(key), dict) else {}
base_max = base.get("max")
if isinstance(current, (int, float)) and isinstance(base_max, (int, float)) and base_max > 0:
if current > base_max * multiplier:
score += 1.5
reasons.append(f"{label} {current:.2f} > {multiplier:.1f}x baseline")
pressure = node.get("pressure_flags") if isinstance(node.get("pressure_flags"), list) else []
if pressure:
score += 2
reasons.append("pressure flags")
return score, reasons
def _node_attention_entries(node_context: list[dict[str, Any]]) -> list[dict[str, Any]]:
entries: list[dict[str, Any]] = []
for node in node_context:
if not isinstance(node, dict):
continue
name = node.get("node")
if not isinstance(name, str) or not name:
continue
score, reasons = _node_attention_score(node)
if score > 0:
entries.append(
{
"kind": "node",
"target": name,
"score": round(score, 2),
"reasons": reasons,
}
)
return entries
def _pvc_attention_entries(metrics: dict[str, Any]) -> list[dict[str, Any]]:
entries: list[dict[str, Any]] = []
for item in _pvc_pressure_entries(metrics):
if not isinstance(item, dict):
continue
used = float(item.get("used_percent") or 0)
entries.append(
{
"kind": "pvc",
"target": f"{item.get('namespace')}/{item.get('pvc')}",
"score": round(1 + (used - _PVC_PRESSURE_THRESHOLD) / 10, 2),
"reasons": [f"usage {used:.1f}%"],
}
)
return entries
def _pod_attention_entries(pod_issues: dict[str, Any]) -> list[dict[str, Any]]:
entries: list[dict[str, Any]] = []
pending = pod_issues.get("pending_over_15m") or 0
if pending:
entries.append(
{
"kind": "pods",
"target": "pending",
"score": float(pending),
"reasons": [f"{int(pending)} pending >15m"],
}
)
return entries
def _workload_attention_entries(workloads_health: dict[str, Any]) -> list[dict[str, Any]]:
entries: list[dict[str, Any]] = []
for item in _workload_not_ready_items(workloads_health)[:5]:
entries.append(
{
"kind": "workload",
"target": f"{item.get('namespace')}/{item.get('name')}",
"score": 2.0,
"reasons": [f"{item.get('ready')}/{item.get('desired')} ready"],
}
)
return entries
def _build_attention_ranked(
metrics: dict[str, Any],
node_context: list[dict[str, Any]],
pod_issues: dict[str, Any],
workloads_health: dict[str, Any],
) -> list[dict[str, Any]]:
entries = (
_node_attention_entries(node_context)
+ _pvc_attention_entries(metrics)
+ _pod_attention_entries(pod_issues)
+ _workload_attention_entries(workloads_health)
)
entries.sort(key=lambda item: (-(item.get("score") or 0), item.get("kind") or "", item.get("target") or ""))
return entries[:5]
def collect_cluster_state() -> tuple[dict[str, Any], ClusterStateSummary]:
errors: list[str] = []
collected_at = datetime.now(timezone.utc)
nodes, node_details, node_summary = _fetch_nodes(errors)
kustomizations = _fetch_flux(errors)
workloads, namespace_pods, namespace_nodes, node_pods, pod_issues = _fetch_pods(errors)
jobs = _fetch_jobs(errors)
longhorn = _fetch_longhorn(errors)
workload_health = _fetch_workload_health(errors)
events = _fetch_events(errors)
metrics = _summarize_metrics(errors)
metrics["node_pods_top"] = _node_pods_top(node_pods)
metrics["node_load"] = _node_usage_profile(
metrics.get("node_usage", {}),
node_details,
node_pods,
)
metrics["node_load_summary"] = _node_load_summary(metrics.get("node_load", []))
metrics["node_load_by_hardware"] = _node_usage_by_hardware(metrics.get("node_load", []), node_details)
metrics["namespace_top"] = {
"cpu": _vector_to_named(metrics.get("namespace_cpu_top", []), "namespace", "namespace"),
"mem": _vector_to_named(metrics.get("namespace_mem_top", []), "namespace", "namespace"),
"net": _vector_to_named(metrics.get("namespace_net_top", []), "namespace", "namespace"),
"io": _vector_to_named(metrics.get("namespace_io_top", []), "namespace", "namespace"),
"restarts": _vector_to_named(metrics.get("restart_namespace_top", []), "namespace", "namespace"),
}
hardware_groups = _hardware_groups(node_details)
pressure_summary = _pressure_summary(node_summary)
anomalies = _build_anomalies(metrics, node_summary, workload_health, kustomizations, events)
namespace_context = _namespace_context(
namespace_pods,
namespace_nodes,
metrics.get("namespace_capacity", []),
metrics.get("namespace_baseline_map", {}),
)
node_workloads = _node_workload_map(workloads)
node_context = _node_context(
node_details,
metrics.get("node_load", []),
metrics.get("node_baseline_map", {}),
node_workloads,
)
signals = _build_signals(
SignalContext(
metrics=metrics,
node_context=node_context,
namespace_context=namespace_context,
workloads_health=workload_health,
pod_issues=pod_issues,
kustomizations=kustomizations,
)
)
profiles = _build_profiles(node_context, namespace_context, node_pods, workloads, node_workloads)
summary = {
"generated_at": collected_at.isoformat(),
"windows": metrics.get("windows", {}),
"baseline_window": _BASELINE_WINDOW,
"inventory": {
**(node_summary or {}),
"hardware_groups": hardware_groups,
},
"counts": {
"nodes_total": metrics.get("nodes_total"),
"nodes_ready": metrics.get("nodes_ready"),
"pods_running": metrics.get("pods_running"),
"pods_pending": metrics.get("pods_pending"),
"pods_failed": metrics.get("pods_failed"),
"pods_succeeded": metrics.get("pods_succeeded"),
},
"pressure_summary": pressure_summary,
"trend_summary": metrics.get("trend_summary"),
"trend_requests": metrics.get("namespace_request_trends"),
"time_series": metrics.get("cluster_trends"),
"node_condition_trends": metrics.get("node_condition_trends"),
"pod_waiting_trends": metrics.get("pod_waiting_trends"),
"pod_terminated_trends": metrics.get("pod_terminated_trends"),
"pod_reason_totals": metrics.get("pod_reason_totals"),
"pod_issue_summary": _pod_issue_summary(pod_issues, metrics),
"offenders": _build_offenders(metrics),
"alerts": metrics.get("alerts", {}),
"top": {
"namespace_cpu": (metrics.get("namespace_totals", {}) or {}).get("cpu", [])[:5],
"namespace_mem": (metrics.get("namespace_totals", {}) or {}).get("mem", [])[:5],
"namespace_pods": namespace_pods[:5],
"node_pods": metrics.get("node_pods_top", []),
"node_load": metrics.get("node_load_summary", {}).get("top", []),
"node_hottest": metrics.get("hottest_nodes", {}),
"postgres": metrics.get("postgres_connections", {}),
"pvc_usage": _pvc_top(metrics.get("pvc_usage_top", [])),
"workload_not_ready": _workload_not_ready_items(workload_health)[:5],
"pod_restarts": _pod_restarts_top(metrics),
},
"relationships": {
"namespace_nodes": _namespace_nodes_top(namespace_context, 5),
"node_namespaces": metrics.get("node_pods_top", []),
"workload_nodes": _workload_nodes_top(workloads, 5),
},
"topology": {
"nodes": _node_workloads_top(node_workloads),
"workloads": _workload_index(workloads),
"namespaces": _namespace_nodes_top(namespace_context, 12),
},
"baseline_deltas": {
"nodes": {
"cpu": _delta_top(node_context, "cpu"),
"ram": _delta_top(node_context, "ram"),
"net": _delta_top(node_context, "net"),
"io": _delta_top(node_context, "io"),
"disk": _delta_top(node_context, "disk"),
},
"namespaces": {
"cpu": _delta_top(namespace_context, "cpu"),
"mem": _delta_top(namespace_context, "mem"),
},
},
"attention_ranked": _build_attention_ranked(metrics, node_context, pod_issues, workload_health),
"signals": signals,
"profiles": profiles,
"anomalies": anomalies,
"health_bullets": _health_bullets(metrics, node_summary, workload_health, anomalies),
"events": _events_summary(events),
"lexicon": _build_lexicon(),
"cross_stats": _build_cross_stats(metrics, node_context, namespace_context, workloads),
"unknowns": errors,
}
snapshot = {
"collected_at": collected_at.isoformat(),
"snapshot_version": "v2",
"summary": summary,
"context": {
"nodes": node_context,
"namespaces": namespace_context,
},
"nodes": nodes,
"nodes_summary": node_summary,
"nodes_detail": node_details,
"flux": kustomizations,
"workloads": workloads,
"namespace_pods": namespace_pods,
"namespace_nodes": namespace_nodes,
"node_pods": node_pods,
"pod_issues": pod_issues,
"jobs": jobs,
"longhorn": longhorn,
"workloads_health": workload_health,
"events": events,
"metrics": metrics,
"errors": errors,
}
summary = ClusterStateSummary(
nodes_total=(nodes or {}).get("total"),
nodes_ready=(nodes or {}).get("ready"),
pods_running=metrics.get("pods_running"),
kustomizations_not_ready=(kustomizations or {}).get("not_ready"),
errors=len(errors),
)
set_cluster_state_metrics(
collected_at,
summary.nodes_total,
summary.nodes_ready,
summary.pods_running,
summary.kustomizations_not_ready,
)
return snapshot, summary
def run_cluster_state(storage: Storage) -> ClusterStateSummary:
snapshot, summary = collect_cluster_state()
try:
storage.record_cluster_state(snapshot)
storage.prune_cluster_state(settings.cluster_state_keep)
except Exception as exc:
logger.info(
"cluster state storage failed",
extra={"event": "cluster_state", "status": "error", "detail": str(exc)},
)
return summary
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,375 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any
import httpx
from ...db.storage import Storage
from ...k8s.client import get_json
from ...metrics.metrics import set_cluster_state_metrics
from ...settings import settings
from ...utils.logging import get_logger
from .inventory import *
logger = get_logger(__name__)
_VALUE_PAIR_LEN = 2
_RATE_WINDOW = "5m"
_RESTARTS_WINDOW = "1h"
_BASELINE_WINDOW = "24h"
_TREND_WINDOWS = ("1h", "6h", "24h")
_TREND_NODE_LIMIT = 30
_TREND_NAMESPACE_LIMIT = 20
_TREND_PVC_LIMIT = 10
_TREND_JOB_LIMIT = 10
_TREND_POD_LIMIT = 15
_NODE_DISK_ALERT = 80.0
_NODE_CPU_ALERT = 80.0
_NODE_RAM_ALERT = 80.0
_NET_SPIKE_MULTIPLIER = 2.0
_IO_SPIKE_MULTIPLIER = 2.0
_NODE_UNAME_LABEL = 'node_uname_info{nodename!=""}'
_WORKLOAD_LABEL_KEYS = (
"app.kubernetes.io/name",
"app",
"k8s-app",
"app.kubernetes.io/instance",
"release",
)
_SYSTEM_NAMESPACES = {
"kube-system",
"kube-public",
"kube-node-lease",
"flux-system",
"monitoring",
"logging",
"traefik",
"cert-manager",
"maintenance",
"postgres",
"vault",
}
_WORKLOAD_ALLOWED_NAMESPACES = {
"maintenance",
}
_BASELINE_DELTA_WARN = 50.0
_BASELINE_DELTA_CRIT = 100.0
_SIGNAL_LIMIT = 15
_PROFILE_LIMIT = 6
_WORKLOAD_INDEX_LIMIT = 20
_NODE_WORKLOAD_LIMIT = 12
_NODE_WORKLOAD_TOP = 3
_EVENTS_SUMMARY_LIMIT = 5
_PVC_CRITICAL_THRESHOLD = 90.0
_CAPACITY_KEYS = {
"cpu",
"memory",
"pods",
"ephemeral-storage",
}
_PRESSURE_TYPES = {
"MemoryPressure",
"DiskPressure",
"PIDPressure",
"NetworkUnavailable",
}
_EVENTS_MAX = 20
_EVENT_WARNING = "Warning"
_PHASE_SEVERITY = {
"Failed": 3,
"Pending": 2,
"Unknown": 1,
}
_PENDING_15M_HOURS = 0.25
_LOAD_TOP_COUNT = 5
_NAMESPACE_TOP_COUNT = 5
_PVC_PRESSURE_THRESHOLD = 80.0
_ALERT_TOP_LIMIT = 10
_POD_REASON_LIMIT = 10
_POD_REASON_TREND_LIMIT = 10
_NAMESPACE_ISSUE_LIMIT = 8
_CROSS_NODE_TOP = 3
_CROSS_NAMESPACE_TOP = 3
_CROSS_PVC_TOP = 3
_POD_TERMINATED_REASONS = {
"oom_killed": "OOMKilled",
"error": "Error",
}
_POD_WAITING_REASONS = {
"crash_loop": "CrashLoopBackOff",
"image_pull_backoff": "ImagePullBackOff",
"err_image_pull": "ErrImagePull",
"create_config_error": "CreateContainerConfigError",
}
_DELTA_TOP_LIMIT = 6
_REASON_TOP_LIMIT = 5
def _node_usage_by_hardware(node_load: list[dict[str, Any]], node_details: list[dict[str, Any]]) -> list[dict[str, Any]]:
if not node_load or not node_details:
return []
hardware_by_node = _hardware_map(node_details)
buckets: dict[str, dict[str, list[float]]] = {}
for entry in node_load:
if not isinstance(entry, dict):
continue
node = entry.get("node")
if not isinstance(node, str) or not node:
continue
hardware = hardware_by_node.get(node, "unknown")
_append_hardware_usage(buckets, str(hardware), entry)
return _finalize_hardware_usage(buckets)
def _hardware_map(node_details: list[dict[str, Any]]) -> dict[str, str]:
mapping: dict[str, str] = {}
for node in node_details:
if not isinstance(node, dict):
continue
name = node.get("name")
if isinstance(name, str) and name:
mapping[name] = str(node.get("hardware") or "unknown")
return mapping
def _append_hardware_usage(buckets: dict[str, dict[str, list[float]]], hardware: str, entry: dict[str, Any]) -> None:
bucket = buckets.setdefault(hardware, {"load_index": [], "cpu": [], "ram": [], "net": [], "io": []})
for key in ("load_index", "cpu", "ram", "net", "io"):
value = entry.get(key)
if isinstance(value, (int, float)):
bucket[key].append(float(value))
def _finalize_hardware_usage(buckets: dict[str, dict[str, list[float]]]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for hardware, metrics in buckets.items():
row: dict[str, Any] = {"hardware": hardware}
for key, values in metrics.items():
if values:
row[key] = sum(values) / len(values)
output.append(row)
output.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("hardware") or ""))
return output
@dataclass(frozen=True)
class ClusterStateSummary:
nodes_total: int | None
nodes_ready: int | None
pods_running: int | None
kustomizations_not_ready: int | None
errors: int
@dataclass(frozen=True)
class SignalContext:
metrics: dict[str, Any]
node_context: list[dict[str, Any]]
namespace_context: list[dict[str, Any]]
workloads_health: dict[str, Any]
pod_issues: dict[str, Any]
kustomizations: dict[str, Any]
def _items(payload: dict[str, Any]) -> list[dict[str, Any]]:
items = payload.get("items") if isinstance(payload.get("items"), list) else []
return [item for item in items if isinstance(item, dict)]
def _node_ready(conditions: Any) -> bool:
if not isinstance(conditions, list):
return False
for condition in conditions:
if not isinstance(condition, dict):
continue
if condition.get("type") == "Ready":
return condition.get("status") == "True"
return False
def _summarize_nodes(payload: dict[str, Any]) -> dict[str, Any]:
names: list[str] = []
not_ready: list[str] = []
for node in _items(payload):
metadata = node.get("metadata") if isinstance(node.get("metadata"), dict) else {}
status = node.get("status") if isinstance(node.get("status"), dict) else {}
name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
if not name:
continue
names.append(name)
if not _node_ready(status.get("conditions")):
not_ready.append(name)
names.sort()
not_ready.sort()
total = len(names)
ready = total - len(not_ready)
return {
"total": total,
"ready": ready,
"not_ready": len(not_ready),
"names": names,
"not_ready_names": not_ready,
}
def _hardware_groups(details: list[dict[str, Any]]) -> list[dict[str, Any]]:
groups: dict[str, list[str]] = {}
for node in details:
if not isinstance(node, dict):
continue
name = node.get("name")
if not isinstance(name, str) or not name:
continue
hardware = str(node.get("hardware") or "unknown")
groups.setdefault(hardware, []).append(name)
output: list[dict[str, Any]] = []
for hardware, nodes in groups.items():
nodes.sort()
output.append({"hardware": hardware, "count": len(nodes), "nodes": nodes})
output.sort(key=lambda item: (-(item.get("count") or 0), item.get("hardware") or ""))
return output
def _pressure_summary(nodes_summary: dict[str, Any]) -> dict[str, Any]:
pressure_nodes = nodes_summary.get("pressure_nodes") if isinstance(nodes_summary, dict) else {}
summary: dict[str, Any] = {"by_type": {}, "total": 0}
if isinstance(pressure_nodes, dict):
for cond, names in pressure_nodes.items():
count = len(names) if isinstance(names, list) else 0
summary["by_type"][cond] = count
summary["total"] += count
unschedulable = nodes_summary.get("unschedulable_nodes") or []
summary["unschedulable"] = len(unschedulable) if isinstance(unschedulable, list) else 0
return summary
def _apply_node_summary(summary: dict[str, Any], node: dict[str, Any]) -> str:
name = node.get("name") if isinstance(node, dict) else ""
if not isinstance(name, str) or not name:
return ""
summary["total"] += 1
ready = bool(node.get("ready"))
if ready:
summary["ready"] += 1
if node.get("is_worker"):
summary["workers"]["total"] += 1
if ready:
summary["workers"]["ready"] += 1
hardware = node.get("hardware") or "unknown"
arch = node.get("arch") or "unknown"
summary["by_hardware"][hardware] = summary["by_hardware"].get(hardware, 0) + 1
summary["by_arch"][arch] = summary["by_arch"].get(arch, 0) + 1
for role in node.get("roles") or []:
summary["by_role"][role] = summary["by_role"].get(role, 0) + 1
_apply_pressure(summary, node, name)
return name
def _apply_pressure(summary: dict[str, Any], node: dict[str, Any], name: str) -> None:
pressure = node.get("pressure") or {}
if not isinstance(pressure, dict):
return
for cond_type, active in pressure.items():
if active and cond_type in summary["pressure_nodes"]:
summary["pressure_nodes"][cond_type].append(name)
def _baseline_delta(current: Any, stats: dict[str, Any]) -> float | None:
if not isinstance(current, (int, float)):
return None
avg = stats.get("avg")
if not isinstance(avg, (int, float)) or avg == 0:
return None
return round(((float(current) - float(avg)) / float(avg)) * 100, 2)
def _workload_not_ready_items(workloads_health: dict[str, Any]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for key in ("deployments", "statefulsets", "daemonsets"):
entry = workloads_health.get(key) if isinstance(workloads_health.get(key), dict) else {}
for item in entry.get("items") or []:
if not isinstance(item, dict):
continue
output.append(
{
"kind": key[:-1],
"namespace": item.get("namespace") or "",
"name": item.get("name") or "",
"desired": item.get("desired"),
"ready": item.get("ready"),
}
)
output.sort(key=lambda item: (item.get("namespace") or "", item.get("name") or ""))
return output
def _pod_restarts_top(metrics: dict[str, Any]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in metrics.get("top_restarts_1h") or []:
if not isinstance(item, dict):
continue
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
namespace = metric.get("namespace")
pod = metric.get("pod")
if not isinstance(namespace, str) or not isinstance(pod, str):
continue
output.append(
{
"namespace": namespace,
"pod": pod,
"value": item.get("value"),
}
)
output.sort(key=lambda item: (-(item.get("value") or 0), item.get("namespace") or ""))
return output[:5]
def _pvc_pressure_entries(metrics: dict[str, Any]) -> list[dict[str, Any]]:
pvc_top = _pvc_top(metrics.get("pvc_usage_top") or [])
return [
entry
for entry in pvc_top
if isinstance(entry, dict)
and isinstance(entry.get("used_percent"), (int, float))
and float(entry.get("used_percent") or 0) >= _PVC_PRESSURE_THRESHOLD
]
def _pvc_top(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in entries:
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
namespace = metric.get("namespace")
pvc = metric.get("persistentvolumeclaim")
if not isinstance(namespace, str) or not isinstance(pvc, str):
continue
output.append(
{
"namespace": namespace,
"pvc": pvc,
"used_percent": item.get("value"),
}
)
output.sort(key=lambda item: (-(item.get("used_percent") or 0), item.get("namespace") or ""))
return output
def _vector_to_named(entries: list[dict[str, Any]], label_key: str, name_key: str) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in entries:
if not isinstance(item, dict):
continue
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
value = item.get("value")
label = metric.get(label_key) if isinstance(metric, dict) else None
if not isinstance(label, str) or not label:
continue
output.append({name_key: label, "value": value, "metric": metric})
output.sort(key=lambda item: (-(item.get("value") or 0), item.get(name_key) or ""))
return output
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,317 @@
from __future__ import annotations
from .common import *
from .nodes import *
from .k8s import *
from .pods import *
from .vm import *
from .trends import *
from .metrics import *
from .metrics_extra import *
def _namespace_context(
namespace_pods: list[dict[str, Any]],
namespace_nodes: list[dict[str, Any]],
namespace_capacity: list[dict[str, Any]],
namespace_baseline: dict[str, dict[str, dict[str, float]]],
) -> list[dict[str, Any]]:
node_map = {entry.get("namespace"): entry for entry in namespace_nodes if isinstance(entry, dict)}
cap_map = {entry.get("namespace"): entry for entry in namespace_capacity if isinstance(entry, dict)}
output: list[dict[str, Any]] = []
for entry in namespace_pods:
if not isinstance(entry, dict):
continue
namespace = entry.get("namespace")
if not isinstance(namespace, str) or not namespace:
continue
nodes_entry = node_map.get(namespace, {})
cap_entry = cap_map.get(namespace, {})
nodes = nodes_entry.get("nodes") if isinstance(nodes_entry.get("nodes"), dict) else {}
top_nodes: list[dict[str, Any]] = []
if isinstance(nodes, dict):
top_nodes = [
{"node": name, "pods": count}
for name, count in sorted(nodes.items(), key=lambda item: (-item[1], item[0]))[:3]
]
baseline = namespace_baseline.get(namespace, {}) if isinstance(namespace_baseline, dict) else {}
delta_cpu = _baseline_delta(cap_entry.get("cpu_usage"), baseline.get("cpu", {}))
delta_mem = _baseline_delta(cap_entry.get("mem_usage"), baseline.get("mem", {}))
baseline_delta = {k: v for k, v in (("cpu", delta_cpu), ("mem", delta_mem)) if v is not None}
output.append(
{
"namespace": namespace,
"pods_total": entry.get("pods_total"),
"pods_running": entry.get("pods_running"),
"pods_pending": entry.get("pods_pending"),
"pods_failed": entry.get("pods_failed"),
"pods_succeeded": entry.get("pods_succeeded"),
"primary_node": nodes_entry.get("primary_node"),
"nodes_top": top_nodes,
"cpu_usage": cap_entry.get("cpu_usage"),
"cpu_requests": cap_entry.get("cpu_requests"),
"cpu_ratio": cap_entry.get("cpu_usage_ratio"),
"mem_usage": cap_entry.get("mem_usage"),
"mem_requests": cap_entry.get("mem_requests"),
"mem_ratio": cap_entry.get("mem_usage_ratio"),
"baseline_delta": baseline_delta,
}
)
output.sort(key=lambda item: (-(item.get("pods_total") or 0), item.get("namespace") or ""))
return output
def _namespace_nodes_top(namespace_context: list[dict[str, Any]], limit: int = 5) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for entry in namespace_context[:limit]:
if not isinstance(entry, dict):
continue
output.append(
{
"namespace": entry.get("namespace"),
"pods_total": entry.get("pods_total"),
"primary_node": entry.get("primary_node"),
"nodes_top": entry.get("nodes_top") or [],
}
)
return output
def _workload_nodes_top(workloads: list[dict[str, Any]], limit: int = 5) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
entries = [w for w in workloads if isinstance(w, dict)]
entries.sort(
key=lambda item: (-(item.get("pods_total") or 0), item.get("namespace") or "", item.get("workload") or ""),
)
for entry in entries[:limit]:
output.append(
{
"namespace": entry.get("namespace"),
"workload": entry.get("workload"),
"source": entry.get("source"),
"pods_total": entry.get("pods_total"),
"pods_running": entry.get("pods_running"),
"primary_node": entry.get("primary_node"),
}
)
return output
def _node_workload_map(workloads: list[dict[str, Any]]) -> dict[str, dict[str, int]]:
mapping: dict[str, dict[str, int]] = {}
for entry in workloads:
if not isinstance(entry, dict):
continue
namespace = entry.get("namespace")
workload = entry.get("workload")
if not isinstance(workload, str) or not workload:
continue
nodes = entry.get("nodes")
if not isinstance(nodes, dict):
continue
key = f"{namespace}/{workload}" if isinstance(namespace, str) and namespace else workload
for node, count in nodes.items():
if not isinstance(node, str) or not node:
continue
if not isinstance(count, int):
try:
count = int(count)
except (TypeError, ValueError):
continue
if count <= 0:
continue
mapping.setdefault(node, {})[key] = mapping.setdefault(node, {}).get(key, 0) + count
return mapping
def _node_workloads_top(
workload_map: dict[str, dict[str, int]],
limit_nodes: int = _NODE_WORKLOAD_LIMIT,
limit_workloads: int = _NODE_WORKLOAD_TOP,
) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for node, workloads in workload_map.items():
if not isinstance(node, str) or not node or not isinstance(workloads, dict):
continue
total = sum(count for count in workloads.values() if isinstance(count, int))
top = sorted(workloads.items(), key=lambda item: (-item[1], item[0]))[:limit_workloads]
output.append({"node": node, "pods_total": total, "workloads_top": top})
output.sort(key=lambda item: (-(item.get("pods_total") or 0), item.get("node") or ""))
return output[:limit_nodes]
def _workload_index(workloads: list[dict[str, Any]], limit: int = _WORKLOAD_INDEX_LIMIT) -> list[dict[str, Any]]:
entries = [entry for entry in workloads if isinstance(entry, dict)]
entries.sort(
key=lambda item: (-(item.get("pods_total") or 0), item.get("namespace") or "", item.get("workload") or ""),
)
output: list[dict[str, Any]] = []
for entry in entries[:limit]:
nodes = entry.get("nodes") if isinstance(entry.get("nodes"), dict) else {}
nodes_top = (
sorted(nodes.items(), key=lambda item: (-item[1], item[0]))[:_NODE_WORKLOAD_TOP]
if isinstance(nodes, dict)
else []
)
output.append(
{
"namespace": entry.get("namespace"),
"workload": entry.get("workload"),
"pods_total": entry.get("pods_total"),
"pods_running": entry.get("pods_running"),
"primary_node": entry.get("primary_node"),
"nodes_top": nodes_top,
}
)
return output
def _events_summary(events: dict[str, Any]) -> dict[str, Any]:
if not isinstance(events, dict):
return {}
by_namespace = events.get("warnings_by_namespace") if isinstance(events.get("warnings_by_namespace"), dict) else {}
top_namespace = ""
top_namespace_count = 0
if by_namespace:
top_namespace, top_namespace_count = sorted(
by_namespace.items(), key=lambda item: (-item[1], item[0])
)[0]
return {
"warnings_total": events.get("warnings_total"),
"top_reason": events.get("warnings_top_reason"),
"top_namespace": {"namespace": top_namespace, "count": top_namespace_count},
"latest": events.get("warnings_latest"),
"recent": (events.get("warnings_recent") or [])[:_EVENTS_SUMMARY_LIMIT],
}
def _build_lexicon() -> dict[str, Any]:
terms = [
{
"term": "hottest",
"meaning": "highest utilization for a metric (cpu, ram, net, io, load_index).",
},
{
"term": "pressure",
"meaning": "node condition flags (MemoryPressure, DiskPressure, PIDPressure, NetworkUnavailable).",
},
{
"term": "load_index",
"meaning": "composite load score derived from cpu, ram, net, io.",
},
{"term": "top", "meaning": "highest values within a category."},
{"term": "pods", "meaning": "running workload instances on a node or namespace."},
{"term": "workload", "meaning": "deployment/statefulset/daemonset grouping."},
]
aliases = {
"hot node": "node with highest load_index",
"hottest by cpu": "node with highest cpu utilization",
"hottest by ram": "node with highest ram utilization",
"pressure node": "node with pressure condition flags",
}
return {"terms": terms, "aliases": aliases}
def _top_named_entries(
entries: list[dict[str, Any]],
name_key: str,
limit: int,
) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for entry in entries or []:
if not isinstance(entry, dict):
continue
name = entry.get(name_key)
if not isinstance(name, str) or not name:
continue
value = entry.get("value")
try:
numeric = float(value)
except (TypeError, ValueError):
numeric = 0.0
output.append({"name": name, "value": numeric})
output.sort(key=lambda item: -(item.get("value") or 0))
return output[:limit]
def _cross_node_metric_top(metrics: dict[str, Any], node_context: list[dict[str, Any]]) -> list[dict[str, Any]]:
usage = metrics.get("node_usage") if isinstance(metrics.get("node_usage"), dict) else {}
node_map = {entry.get("node"): entry for entry in node_context if isinstance(entry, dict)}
output: list[dict[str, Any]] = []
for metric in ("cpu", "ram", "net", "io", "disk"):
series = usage.get(metric)
if not isinstance(series, list):
continue
for top in _top_named_entries(series, "node", _CROSS_NODE_TOP):
node = top.get("name")
if not node:
continue
context = node_map.get(node, {})
output.append(
{
"metric": metric,
"node": node,
"value": top.get("value"),
"cpu": context.get("cpu"),
"ram": context.get("ram"),
"net": context.get("net"),
"io": context.get("io"),
"disk": context.get("disk"),
"load_index": context.get("load_index"),
"pods_total": context.get("pods_total"),
"hardware": context.get("hardware"),
"roles": context.get("roles"),
"pressure_flags": context.get("pressure_flags"),
}
)
return output
def _cross_namespace_metric_top(
metrics: dict[str, Any],
namespace_context: list[dict[str, Any]],
) -> list[dict[str, Any]]:
top = metrics.get("namespace_top") if isinstance(metrics.get("namespace_top"), dict) else {}
namespace_map = {
entry.get("namespace"): entry
for entry in namespace_context
if isinstance(entry, dict) and entry.get("namespace")
}
output: list[dict[str, Any]] = []
for metric in ("cpu", "mem", "net", "io", "restarts"):
series = top.get(metric)
if not isinstance(series, list):
continue
for entry in _top_named_entries(series, "namespace", _CROSS_NAMESPACE_TOP):
namespace = entry.get("name")
if not namespace:
continue
context = namespace_map.get(namespace, {})
output.append(
{
"metric": metric,
"namespace": namespace,
"value": entry.get("value"),
"pods_total": context.get("pods_total"),
"pods_running": context.get("pods_running"),
"cpu_ratio": context.get("cpu_ratio"),
"mem_ratio": context.get("mem_ratio"),
"primary_node": context.get("primary_node"),
"nodes_top": context.get("nodes_top") or [],
}
)
return output
def _build_cross_stats(
metrics: dict[str, Any],
node_context: list[dict[str, Any]],
namespace_context: list[dict[str, Any]],
workloads: list[dict[str, Any]],
) -> dict[str, Any]:
return {
"node_metric_top": _cross_node_metric_top(metrics, node_context),
"namespace_metric_top": _cross_namespace_metric_top(metrics, namespace_context),
"pvc_top": _pvc_top(metrics.get("pvc_usage_top", []))[:_CROSS_PVC_TOP],
"workload_top": _workload_nodes_top(workloads, _CROSS_NAMESPACE_TOP),
}
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,288 @@
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any
_PRESSURE_TYPES = {
"MemoryPressure",
"DiskPressure",
"PIDPressure",
"NetworkUnavailable",
}
def _node_ready(conditions: Any) -> bool:
if not isinstance(conditions, list):
return False
for condition in conditions:
if not isinstance(condition, dict):
continue
if condition.get("type") == "Ready":
return condition.get("status") == "True"
return False
def _node_labels(labels: dict[str, Any]) -> dict[str, Any]:
if not isinstance(labels, dict):
return {}
keep: dict[str, Any] = {}
for key, value in labels.items():
if key.startswith("node-role.kubernetes.io/"):
keep[key] = value
if key in {
"kubernetes.io/arch",
"kubernetes.io/hostname",
"beta.kubernetes.io/arch",
"hardware",
"jetson",
}:
keep[key] = value
return keep
def _node_addresses(status: dict[str, Any]) -> dict[str, str]:
addresses = status.get("addresses") if isinstance(status.get("addresses"), list) else []
output: dict[str, str] = {}
for addr in addresses:
if not isinstance(addr, dict):
continue
addr_type = addr.get("type")
addr_value = addr.get("address")
if isinstance(addr_type, str) and isinstance(addr_value, str):
output[addr_type] = addr_value
return output
def _age_hours(timestamp: str) -> float | None:
if not timestamp:
return None
try:
parsed = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
except ValueError:
return None
return round((datetime.now(timezone.utc) - parsed).total_seconds() / 3600, 1)
def _node_taints(raw: Any) -> list[dict[str, str]]:
if not isinstance(raw, list):
return []
taints: list[dict[str, str]] = []
for entry in raw:
if not isinstance(entry, dict):
continue
key = entry.get("key")
effect = entry.get("effect")
value = entry.get("value")
if isinstance(key, str) and isinstance(effect, str):
taints.append(
{
"key": key,
"value": value if isinstance(value, str) else "",
"effect": effect,
}
)
return taints
def _node_capacity(raw: Any) -> dict[str, str]:
if not isinstance(raw, dict):
return {}
output: dict[str, str] = {}
for key in ("cpu", "memory", "pods", "ephemeral-storage"):
value = raw.get(key)
if isinstance(value, (str, int, float)) and value != "":
output[key] = str(value)
return output
def _node_pressure_conditions(conditions: Any) -> dict[str, bool]:
if not isinstance(conditions, list):
return {}
pressure: dict[str, bool] = {}
for condition in conditions:
if not isinstance(condition, dict):
continue
cond_type = condition.get("type")
if cond_type in _PRESSURE_TYPES:
pressure[cond_type] = condition.get("status") == "True"
return pressure
def _node_roles(labels: dict[str, Any]) -> list[str]:
roles: list[str] = []
for key in labels.keys():
if key.startswith("node-role.kubernetes.io/"):
role = key.split("/", 1)[-1]
if role:
roles.append(role)
return sorted(set(roles))
def _node_is_worker(labels: dict[str, Any]) -> bool:
if "node-role.kubernetes.io/control-plane" in labels:
return False
if "node-role.kubernetes.io/master" in labels:
return False
if "node-role.kubernetes.io/worker" in labels:
return True
return True
def _hardware_hint(labels: dict[str, Any], node_info: dict[str, Any]) -> str:
result = "unknown"
if str(labels.get("jetson") or "").lower() == "true":
result = "jetson"
else:
hardware = (labels.get("hardware") or "").strip().lower()
if hardware:
result = hardware
else:
kernel = str(node_info.get("kernelVersion") or "").lower()
os_image = str(node_info.get("osImage") or "").lower()
if "tegra" in kernel or "jetson" in os_image:
result = "jetson"
elif "raspi" in kernel or "bcm2711" in kernel:
result = "rpi"
else:
arch = str(node_info.get("architecture") or "").lower()
if arch == "amd64":
result = "amd64"
elif arch == "arm64":
result = "arm64-unknown"
return result
def _node_details(payload: dict[str, Any]) -> list[dict[str, Any]]:
details: list[dict[str, Any]] = []
for node in payload.get("items") if isinstance(payload.get("items"), list) else []:
if not isinstance(node, dict):
continue
metadata = node.get("metadata") if isinstance(node.get("metadata"), dict) else {}
spec = node.get("spec") if isinstance(node.get("spec"), dict) else {}
status = node.get("status") if isinstance(node.get("status"), dict) else {}
node_info = status.get("nodeInfo") if isinstance(status.get("nodeInfo"), dict) else {}
labels = metadata.get("labels") if isinstance(metadata.get("labels"), dict) else {}
name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
if not name:
continue
created_at = metadata.get("creationTimestamp") if isinstance(metadata.get("creationTimestamp"), str) else ""
taints = _node_taints(spec.get("taints"))
details.append(
{
"name": name,
"ready": _node_ready(status.get("conditions")),
"roles": _node_roles(labels),
"is_worker": _node_is_worker(labels),
"labels": labels,
"hardware": _hardware_hint(labels, node_info),
"arch": node_info.get("architecture") or "",
"os": node_info.get("operatingSystem") or "",
"kernel": node_info.get("kernelVersion") or "",
"kubelet": node_info.get("kubeletVersion") or "",
"container_runtime": node_info.get("containerRuntimeVersion") or "",
"addresses": _node_addresses(status),
"created_at": created_at,
"age_hours": _age_hours(created_at),
"taints": taints,
"unschedulable": bool(spec.get("unschedulable")),
"capacity": _node_capacity(status.get("capacity")),
"allocatable": _node_capacity(status.get("allocatable")),
"pressure": _node_pressure_conditions(status.get("conditions")),
}
)
details.sort(key=lambda item: item.get("name") or "")
return details
def _node_age_stats(details: list[dict[str, Any]]) -> dict[str, Any]:
ages: list[tuple[str, float]] = []
for node in details:
name = node.get("name") if isinstance(node, dict) else ""
age = node.get("age_hours")
if isinstance(name, str) and name and isinstance(age, (int, float)):
ages.append((name, float(age)))
if not ages:
return {}
ages.sort(key=lambda item: item[1])
values = [age for _, age in ages]
return {
"min": round(min(values), 1),
"max": round(max(values), 1),
"avg": round(sum(values) / len(values), 1),
"youngest": [{"name": name, "age_hours": age} for name, age in ages[:5]],
"oldest": [{"name": name, "age_hours": age} for name, age in ages[-5:]],
}
def _node_flagged(details: list[dict[str, Any]], key: str) -> list[str]:
names: list[str] = []
for node in details:
name = node.get("name") if isinstance(node, dict) else ""
if not isinstance(name, str) or not name:
continue
if node.get(key):
names.append(name)
names.sort()
return names
def _apply_node_summary(summary: dict[str, Any], node: dict[str, Any]) -> str:
name = node.get("name") if isinstance(node.get("name"), str) else ""
if not name:
return ""
summary["total"] += 1
if node.get("ready"):
summary["ready"] += 1
if node.get("is_worker"):
summary["workers"]["total"] += 1
if node.get("ready"):
summary["workers"]["ready"] += 1
hardware = str(node.get("hardware") or "unknown")
summary["by_hardware"][hardware] = summary["by_hardware"].get(hardware, 0) + 1
arch = str(node.get("arch") or "unknown")
summary["by_arch"][arch] = summary["by_arch"].get(arch, 0) + 1
for role in node.get("roles") or []:
summary["by_role"][role] = summary["by_role"].get(role, 0) + 1
_apply_pressure(summary, node, name)
return name
def _apply_pressure(summary: dict[str, Any], node: dict[str, Any], name: str) -> None:
pressure = node.get("pressure") or {}
if not isinstance(pressure, dict):
return
for cond_type, active in pressure.items():
if active and cond_type in summary["pressure_nodes"]:
summary["pressure_nodes"][cond_type].append(name)
def _summarize_inventory(details: list[dict[str, Any]]) -> dict[str, Any]:
summary = {
"total": 0,
"ready": 0,
"workers": {"total": 0, "ready": 0},
"by_hardware": {},
"by_arch": {},
"by_role": {},
"not_ready_names": [],
"pressure_nodes": {key: [] for key in _PRESSURE_TYPES},
"age_stats": {},
"tainted_nodes": [],
"unschedulable_nodes": [],
}
not_ready: list[str] = []
for node in details:
name = _apply_node_summary(summary, node)
if name and not node.get("ready"):
not_ready.append(name)
not_ready.sort()
summary["not_ready_names"] = not_ready
for cond_type in summary["pressure_nodes"]:
summary["pressure_nodes"][cond_type].sort()
summary["age_stats"] = _node_age_stats(details)
summary["tainted_nodes"] = _node_flagged(details, "taints")
summary["unschedulable_nodes"] = _node_flagged(details, "unschedulable")
return summary
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,330 @@
from __future__ import annotations
from .common import *
from .nodes import *
def _summarize_kustomizations(payload: dict[str, Any]) -> dict[str, Any]:
not_ready: list[dict[str, Any]] = []
for item in _items(payload):
metadata = item.get("metadata") if isinstance(item.get("metadata"), dict) else {}
spec = item.get("spec") if isinstance(item.get("spec"), dict) else {}
status = item.get("status") if isinstance(item.get("status"), dict) else {}
name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
conditions = status.get("conditions")
ready, reason, message = _condition_status(conditions, "Ready")
suspended = bool(spec.get("suspend"))
if ready is True and not suspended:
continue
not_ready.append(
{
"name": name,
"namespace": namespace,
"ready": ready,
"suspended": suspended,
"reason": reason,
"message": message,
}
)
not_ready.sort(key=lambda item: (item.get("namespace") or "", item.get("name") or ""))
return {
"total": len(_items(payload)),
"not_ready": len(not_ready),
"items": not_ready,
}
def _namespace_allowed(namespace: str) -> bool:
if not namespace:
return False
if namespace in _WORKLOAD_ALLOWED_NAMESPACES:
return True
return namespace not in _SYSTEM_NAMESPACES
def _event_timestamp(event: dict[str, Any]) -> str:
for key in ("eventTime", "lastTimestamp", "firstTimestamp"):
value = event.get(key)
if isinstance(value, str) and value:
return value
return ""
def _event_sort_key(timestamp: str) -> float:
if not timestamp:
return 0.0
try:
return datetime.fromisoformat(timestamp.replace("Z", "+00:00")).timestamp()
except ValueError:
return 0.0
def _summarize_events(payload: dict[str, Any]) -> dict[str, Any]:
warnings: list[dict[str, Any]] = []
by_reason: dict[str, int] = {}
by_namespace: dict[str, int] = {}
for event in _items(payload):
metadata = event.get("metadata") if isinstance(event.get("metadata"), dict) else {}
namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
if not _namespace_allowed(namespace):
continue
event_type = event.get("type") if isinstance(event.get("type"), str) else ""
if event_type != _EVENT_WARNING:
continue
reason = event.get("reason") if isinstance(event.get("reason"), str) else ""
message = event.get("message") if isinstance(event.get("message"), str) else ""
count = event.get("count") if isinstance(event.get("count"), int) else 1
involved = (
event.get("involvedObject") if isinstance(event.get("involvedObject"), dict) else {}
)
timestamp = _event_timestamp(event)
warnings.append(
{
"namespace": namespace,
"reason": reason,
"message": message,
"count": count,
"last_seen": timestamp,
"object_kind": involved.get("kind") or "",
"object_name": involved.get("name") or "",
}
)
if reason:
by_reason[reason] = by_reason.get(reason, 0) + count
if namespace:
by_namespace[namespace] = by_namespace.get(namespace, 0) + count
warnings.sort(key=lambda item: _event_sort_key(item.get("last_seen") or ""), reverse=True)
top = warnings[:_EVENTS_MAX]
top_reason = ""
top_reason_count = 0
if by_reason:
top_reason, top_reason_count = sorted(
by_reason.items(), key=lambda item: (-item[1], item[0])
)[0]
latest_warning = top[0] if top else None
return {
"warnings_total": len(warnings),
"warnings_by_reason": by_reason,
"warnings_by_namespace": by_namespace,
"warnings_recent": top,
"warnings_top_reason": {"reason": top_reason, "count": top_reason_count},
"warnings_latest": latest_warning,
}
def _workload_from_labels(labels: dict[str, Any]) -> tuple[str, str]:
for key in _WORKLOAD_LABEL_KEYS:
value = labels.get(key)
if isinstance(value, str) and value:
return value, f"label:{key}"
return "", ""
def _owner_reference(metadata: dict[str, Any]) -> tuple[str, str]:
owners = metadata.get("ownerReferences") if isinstance(metadata.get("ownerReferences"), list) else []
for owner in owners:
if not isinstance(owner, dict):
continue
name = owner.get("name")
kind = owner.get("kind")
if isinstance(name, str) and name:
return name, f"owner:{kind or 'unknown'}"
return "", ""
def _pod_workload(meta: dict[str, Any]) -> tuple[str, str]:
labels = meta.get("labels") if isinstance(meta.get("labels"), dict) else {}
name, source = _workload_from_labels(labels)
if name:
return name, source
return _owner_reference(meta)
def _summarize_workloads(payload: dict[str, Any]) -> list[dict[str, Any]]:
workloads: dict[tuple[str, str], dict[str, Any]] = {}
for pod in _items(payload):
metadata = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
spec = pod.get("spec") if isinstance(pod.get("spec"), dict) else {}
status = pod.get("status") if isinstance(pod.get("status"), dict) else {}
namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
if not _namespace_allowed(namespace):
continue
workload, source = _pod_workload(metadata)
if not workload:
continue
node = spec.get("nodeName") if isinstance(spec.get("nodeName"), str) else ""
phase = status.get("phase") if isinstance(status.get("phase"), str) else ""
key = (namespace, workload)
entry = workloads.setdefault(
key,
{
"namespace": namespace,
"workload": workload,
"source": source,
"nodes": {},
"pods_total": 0,
"pods_running": 0,
},
)
entry["pods_total"] += 1
if phase == "Running":
entry["pods_running"] += 1
if node:
nodes = entry["nodes"]
nodes[node] = nodes.get(node, 0) + 1
output: list[dict[str, Any]] = []
for entry in workloads.values():
nodes = entry.get("nodes") or {}
primary = ""
if isinstance(nodes, dict) and nodes:
primary = sorted(nodes.items(), key=lambda item: (-item[1], item[0]))[0][0]
entry["primary_node"] = primary
output.append(entry)
output.sort(key=lambda item: (item.get("namespace") or "", item.get("workload") or ""))
return output
def _summarize_namespace_pods(payload: dict[str, Any]) -> list[dict[str, Any]]:
namespaces: dict[str, dict[str, Any]] = {}
for pod in _items(payload):
metadata = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
status = pod.get("status") if isinstance(pod.get("status"), dict) else {}
namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
if not _namespace_allowed(namespace):
continue
phase = status.get("phase") if isinstance(status.get("phase"), str) else ""
entry = namespaces.setdefault(
namespace,
{
"namespace": namespace,
"pods_total": 0,
"pods_running": 0,
"pods_pending": 0,
"pods_failed": 0,
"pods_succeeded": 0,
},
)
entry["pods_total"] += 1
if phase == "Running":
entry["pods_running"] += 1
elif phase == "Pending":
entry["pods_pending"] += 1
elif phase == "Failed":
entry["pods_failed"] += 1
elif phase == "Succeeded":
entry["pods_succeeded"] += 1
output = list(namespaces.values())
output.sort(key=lambda item: (-item.get("pods_total", 0), item.get("namespace") or ""))
return output
def _summarize_namespace_nodes(payload: dict[str, Any]) -> list[dict[str, Any]]:
namespaces: dict[str, dict[str, Any]] = {}
for pod in _items(payload):
metadata = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
spec = pod.get("spec") if isinstance(pod.get("spec"), dict) else {}
status = pod.get("status") if isinstance(pod.get("status"), dict) else {}
namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
if not _namespace_allowed(namespace):
continue
node = spec.get("nodeName") if isinstance(spec.get("nodeName"), str) else ""
if not node:
continue
phase = status.get("phase") if isinstance(status.get("phase"), str) else ""
entry = namespaces.setdefault(
namespace,
{
"namespace": namespace,
"pods_total": 0,
"pods_running": 0,
"nodes": {},
},
)
entry["pods_total"] += 1
if phase == "Running":
entry["pods_running"] += 1
nodes = entry["nodes"]
nodes[node] = nodes.get(node, 0) + 1
output: list[dict[str, Any]] = []
for entry in namespaces.values():
nodes = entry.get("nodes") or {}
primary = ""
if isinstance(nodes, dict) and nodes:
primary = sorted(nodes.items(), key=lambda item: (-item[1], item[0]))[0][0]
entry["primary_node"] = primary
output.append(entry)
output.sort(key=lambda item: (-item.get("pods_total", 0), item.get("namespace") or ""))
return output
_NODE_PHASE_KEYS = {
"Running": "pods_running",
"Pending": "pods_pending",
"Failed": "pods_failed",
"Succeeded": "pods_succeeded",
}
def _summarize_node_pods(payload: dict[str, Any]) -> list[dict[str, Any]]:
nodes: dict[str, dict[str, Any]] = {}
for pod in _items(payload):
context = _node_pod_context(pod)
if not context:
continue
node, namespace, phase = context
entry = _node_pod_entry(nodes, node)
_node_pod_apply(entry, namespace, phase)
return _node_pod_finalize(nodes)
def _node_pod_context(pod: dict[str, Any]) -> tuple[str, str, str] | None:
metadata = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
if not _namespace_allowed(namespace):
return None
spec = pod.get("spec") if isinstance(pod.get("spec"), dict) else {}
node = spec.get("nodeName") if isinstance(spec.get("nodeName"), str) else ""
if not node:
return None
status = pod.get("status") if isinstance(pod.get("status"), dict) else {}
phase = status.get("phase") if isinstance(status.get("phase"), str) else ""
return node, namespace, phase
def _node_pod_entry(nodes: dict[str, dict[str, Any]], node: str) -> dict[str, Any]:
return nodes.setdefault(
node,
{
"node": node,
"pods_total": 0,
"pods_running": 0,
"pods_pending": 0,
"pods_failed": 0,
"pods_succeeded": 0,
"namespaces": {},
},
)
def _node_pod_apply(entry: dict[str, Any], namespace: str, phase: str) -> None:
entry["pods_total"] += 1
phase_key = _NODE_PHASE_KEYS.get(phase)
if phase_key:
entry[phase_key] += 1
if namespace:
namespaces = entry["namespaces"]
namespaces[namespace] = namespaces.get(namespace, 0) + 1
def _node_pod_finalize(nodes: dict[str, dict[str, Any]]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for entry in nodes.values():
namespaces = entry.get("namespaces") or {}
if isinstance(namespaces, dict):
entry["namespaces_top"] = sorted(
namespaces.items(), key=lambda item: (-item[1], item[0])
)[:3]
output.append(entry)
output.sort(key=lambda item: (-item.get("pods_total", 0), item.get("node") or ""))
return output
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,394 @@
from __future__ import annotations
from .common import *
from .nodes import *
from .k8s import *
from .pods import *
from .vm import *
from .trends import *
def _pvc_usage_trends() -> dict[str, list[dict[str, Any]]]:
trends: dict[str, list[dict[str, Any]]] = {}
expr = "kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100"
for window in _TREND_WINDOWS:
entries = _vm_vector(
f"topk({_TREND_PVC_LIMIT}, max_over_time(({expr})[{window}]))"
)
trends[window] = _pvc_top(entries)
return trends
def _postgres_connections(errors: list[str]) -> dict[str, Any]:
postgres: dict[str, Any] = {}
try:
postgres["used"] = _vm_scalar("sum(pg_stat_activity_count)")
postgres["max"] = _vm_scalar("max(pg_settings_max_connections)")
postgres["by_db"] = _vm_vector(
"topk(5, sum by (datname) (pg_stat_activity_count))"
)
postgres["hottest_db"] = _vm_topk(
"topk(1, sum by (datname) (pg_stat_activity_count))",
"datname",
)
except Exception as exc:
errors.append(f"postgres: {exc}")
return postgres
def _hottest_nodes(errors: list[str]) -> dict[str, Any]:
hottest: dict[str, Any] = {}
try:
hottest["cpu"] = _vm_topk(
f'label_replace(topk(1, avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) '
f'* on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
"node",
)
hottest["ram"] = _vm_topk(
f'label_replace(topk(1, avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) '
f'/ node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
"node",
)
hottest["net"] = _vm_topk(
f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) '
f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
"node",
)
hottest["io"] = _vm_topk(
f'label_replace(topk(1, avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) '
f'* on(instance) group_left(node) label_replace({_NODE_UNAME_LABEL}, "node", "$1", "nodename", "(.*)"))), "__name__", "$1", "node", "(.*)")',
"node",
)
except Exception as exc:
errors.append(f"hottest: {exc}")
return hottest
def _node_usage(errors: list[str]) -> dict[str, Any]:
usage: dict[str, Any] = {}
try:
exprs = _node_usage_exprs()
usage["cpu"] = _vm_node_metric(exprs["cpu"], "node")
usage["ram"] = _vm_node_metric(exprs["ram"], "node")
usage["net"] = _vm_node_metric(exprs["net"], "node")
usage["io"] = _vm_node_metric(exprs["io"], "node")
usage["disk"] = _vm_node_metric(exprs["disk"], "node")
except Exception as exc:
errors.append(f"node_usage: {exc}")
return usage
def _pvc_usage(errors: list[str]) -> list[dict[str, Any]]:
try:
entries = _vm_vector(
"topk(5, max by (namespace,persistentvolumeclaim) "
"(kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100))"
)
return _filter_namespace_vector(entries)
except Exception as exc:
errors.append(f"pvc_usage: {exc}")
return []
def _usage_stats(series: list[dict[str, Any]]) -> dict[str, float]:
values: list[float] = []
for entry in series:
if not isinstance(entry, dict):
continue
try:
values.append(float(entry.get("value")))
except (TypeError, ValueError):
continue
if not values:
return {}
return {
"min": min(values),
"max": max(values),
"avg": sum(values) / len(values),
}
def _vm_namespace_totals(expr: str) -> dict[str, float]:
totals: dict[str, float] = {}
for item in _vm_vector(expr):
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
namespace = metric.get("namespace")
if not isinstance(namespace, str) or not namespace:
continue
try:
totals[namespace] = float(item.get("value"))
except (TypeError, ValueError):
continue
return totals
def _build_namespace_capacity(
cpu_usage: dict[str, float],
cpu_requests: dict[str, float],
mem_usage: dict[str, float],
mem_requests: dict[str, float],
) -> list[dict[str, Any]]:
namespaces = sorted(set(cpu_usage) | set(cpu_requests) | set(mem_usage) | set(mem_requests))
output: list[dict[str, Any]] = []
for namespace in namespaces:
cpu_used = cpu_usage.get(namespace)
cpu_req = cpu_requests.get(namespace)
mem_used = mem_usage.get(namespace)
mem_req = mem_requests.get(namespace)
cpu_ratio = None
mem_ratio = None
if isinstance(cpu_used, (int, float)) and isinstance(cpu_req, (int, float)) and cpu_req > 0:
cpu_ratio = cpu_used / cpu_req
if isinstance(mem_used, (int, float)) and isinstance(mem_req, (int, float)) and mem_req > 0:
mem_ratio = mem_used / mem_req
output.append(
{
"namespace": namespace,
"cpu_usage": cpu_used,
"cpu_requests": cpu_req,
"cpu_usage_ratio": cpu_ratio,
"mem_usage": mem_used,
"mem_requests": mem_req,
"mem_usage_ratio": mem_ratio,
}
)
output.sort(
key=lambda item: (
-(item.get("cpu_requests") or 0),
-(item.get("mem_requests") or 0),
item.get("namespace") or "",
)
)
return output
def _node_usage_profile(
node_usage: dict[str, list[dict[str, Any]]],
node_details: list[dict[str, Any]],
node_pods: list[dict[str, Any]],
) -> list[dict[str, Any]]:
usage: dict[str, dict[str, Any]] = {}
for key in ("cpu", "ram", "disk", "net", "io"):
for item in node_usage.get(key, []) or []:
node = item.get("node")
value = item.get("value")
if not isinstance(node, str) or not node:
continue
if not isinstance(value, (int, float)):
continue
usage.setdefault(node, {})[key] = float(value)
max_values: dict[str, float] = {}
for key in ("cpu", "ram", "disk", "net", "io"):
values = [entry.get(key) for entry in usage.values() if isinstance(entry.get(key), (int, float))]
max_values[key] = max(values) if values else 0.0
detail_map: dict[str, dict[str, Any]] = {
entry.get("name"): entry for entry in node_details if isinstance(entry, dict)
}
pod_map: dict[str, dict[str, Any]] = {
entry.get("node"): entry for entry in node_pods if isinstance(entry, dict)
}
output: list[dict[str, Any]] = []
for node, entry in usage.items():
detail = detail_map.get(node, {})
pressure = detail.get("pressure") if isinstance(detail.get("pressure"), dict) else {}
pressure_count = sum(1 for value in pressure.values() if value)
taints = detail.get("taints") if isinstance(detail.get("taints"), list) else []
unschedulable = bool(detail.get("unschedulable"))
pods_total = None
pod_entry = pod_map.get(node)
if isinstance(pod_entry, dict):
pods_total = pod_entry.get("pods_total")
normalized: dict[str, float] = {}
for key in ("cpu", "ram", "disk", "net", "io"):
raw = entry.get(key)
max_val = max_values.get(key) or 0.0
if isinstance(raw, (int, float)) and max_val > 0:
normalized[f"{key}_norm"] = raw / max_val
norm_values = [v for v in normalized.values() if isinstance(v, (int, float))]
load_index = sum(norm_values) / len(norm_values) if norm_values else None
output.append(
{
"node": node,
"cpu": entry.get("cpu"),
"ram": entry.get("ram"),
"disk": entry.get("disk"),
"net": entry.get("net"),
"io": entry.get("io"),
**normalized,
"pressure_flags": pressure,
"pressure_count": pressure_count,
"taints": taints,
"unschedulable": unschedulable,
"pods_total": pods_total,
"load_index": load_index,
}
)
output.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("node") or ""))
return output
def _percentile(values: list[float], percentile: float) -> float | None:
if not values:
return None
ordered = sorted(values)
idx = int(round((len(ordered) - 1) * percentile))
idx = min(max(idx, 0), len(ordered) - 1)
return ordered[idx]
def _node_load_summary(node_load: list[dict[str, Any]]) -> dict[str, Any]:
items = [
entry
for entry in node_load
if isinstance(entry, dict) and isinstance(entry.get("load_index"), (int, float))
]
if not items:
return {}
values = [float(entry.get("load_index") or 0) for entry in items]
avg = sum(values) / len(values)
variance = sum((value - avg) ** 2 for value in values) / len(values)
stddev = variance**0.5
top = sorted(items, key=lambda item: -(item.get("load_index") or 0))[:_LOAD_TOP_COUNT]
bottom = sorted(items, key=lambda item: (item.get("load_index") or 0))[:_LOAD_TOP_COUNT]
outliers = [
item
for item in items
if isinstance(item.get("load_index"), (int, float))
and item.get("load_index") >= avg + stddev
]
outliers.sort(key=lambda item: -(item.get("load_index") or 0))
return {
"avg": round(avg, 3),
"p90": round(_percentile(values, 0.9) or 0.0, 3),
"min": round(min(values), 3),
"max": round(max(values), 3),
"top": top,
"bottom": bottom,
"outliers": outliers[:_LOAD_TOP_COUNT],
}
def _namespace_capacity_summary(capacity: list[dict[str, Any]]) -> dict[str, Any]:
if not capacity:
return {}
cpu_ratio = [
entry
for entry in capacity
if isinstance(entry, dict) and isinstance(entry.get("cpu_usage_ratio"), (int, float))
]
mem_ratio = [
entry
for entry in capacity
if isinstance(entry, dict) and isinstance(entry.get("mem_usage_ratio"), (int, float))
]
cpu_ratio.sort(key=lambda item: -(item.get("cpu_usage_ratio") or 0))
mem_ratio.sort(key=lambda item: -(item.get("mem_usage_ratio") or 0))
cpu_headroom: list[dict[str, Any]] = []
mem_headroom: list[dict[str, Any]] = []
for entry in capacity:
if not isinstance(entry, dict):
continue
cpu_used = entry.get("cpu_usage")
cpu_req = entry.get("cpu_requests")
mem_used = entry.get("mem_usage")
mem_req = entry.get("mem_requests")
if isinstance(cpu_used, (int, float)) and isinstance(cpu_req, (int, float)):
cpu_headroom.append(
{
"namespace": entry.get("namespace"),
"headroom": cpu_req - cpu_used,
"usage": cpu_used,
"requests": cpu_req,
"ratio": entry.get("cpu_usage_ratio"),
}
)
if isinstance(mem_used, (int, float)) and isinstance(mem_req, (int, float)):
mem_headroom.append(
{
"namespace": entry.get("namespace"),
"headroom": mem_req - mem_used,
"usage": mem_used,
"requests": mem_req,
"ratio": entry.get("mem_usage_ratio"),
}
)
cpu_headroom.sort(key=lambda item: (item.get("headroom") or 0))
mem_headroom.sort(key=lambda item: (item.get("headroom") or 0))
cpu_over_names = [
entry.get("namespace")
for entry in cpu_ratio
if (entry.get("cpu_usage_ratio") or 0) > 1 and entry.get("namespace")
]
mem_over_names = [
entry.get("namespace")
for entry in mem_ratio
if (entry.get("mem_usage_ratio") or 0) > 1 and entry.get("namespace")
]
over_cpu = len(cpu_over_names)
over_mem = len(mem_over_names)
return {
"cpu_ratio_top": cpu_ratio[:_NAMESPACE_TOP_COUNT],
"mem_ratio_top": mem_ratio[:_NAMESPACE_TOP_COUNT],
"cpu_headroom_low": cpu_headroom[:_NAMESPACE_TOP_COUNT],
"mem_headroom_low": mem_headroom[:_NAMESPACE_TOP_COUNT],
"cpu_overcommitted": over_cpu,
"mem_overcommitted": over_mem,
"cpu_overcommitted_names": sorted({name for name in cpu_over_names if isinstance(name, str)}),
"mem_overcommitted_names": sorted({name for name in mem_over_names if isinstance(name, str)}),
}
def _collect_vm_core(metrics: dict[str, Any], errors: list[str]) -> None:
try:
metrics["nodes_total"] = _vm_scalar("count(kube_node_info)")
metrics["nodes_ready"] = _vm_scalar(
"count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})"
)
metrics["capacity_cpu"] = _vm_scalar("sum(kube_node_status_capacity_cpu_cores)")
metrics["allocatable_cpu"] = _vm_scalar("sum(kube_node_status_allocatable_cpu_cores)")
metrics["capacity_mem_bytes"] = _vm_scalar("sum(kube_node_status_capacity_memory_bytes)")
metrics["allocatable_mem_bytes"] = _vm_scalar("sum(kube_node_status_allocatable_memory_bytes)")
metrics["capacity_pods"] = _vm_scalar("sum(kube_node_status_capacity_pods)")
metrics["allocatable_pods"] = _vm_scalar("sum(kube_node_status_allocatable_pods)")
metrics["pods_running"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Running\"})")
metrics["pods_pending"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Pending\"})")
metrics["pods_failed"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Failed\"})")
metrics["pods_succeeded"] = _vm_scalar("sum(kube_pod_status_phase{phase=\"Succeeded\"})")
metrics["top_restarts_1h"] = _vm_vector(
f"topk(5, sum by (namespace,pod) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
)
metrics["restart_namespace_top"] = _filter_namespace_vector(
_vm_vector(
f"topk(5, sum by (namespace) (increase(kube_pod_container_status_restarts_total[{_RESTARTS_WINDOW}])))"
)
)
metrics["pod_cpu_top"] = _filter_namespace_vector(
_vm_vector(
f'topk(5, sum by (namespace,pod) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}])))'
)
)
metrics["pod_cpu_top_node"] = _filter_namespace_vector(
_vm_vector(
f'topk(5, sum by (node,namespace,pod) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}]) * on (namespace,pod) group_left(node) kube_pod_info))'
)
)
metrics["pod_mem_top"] = _filter_namespace_vector(
_vm_vector(
"topk(5, sum by (namespace,pod) (container_memory_working_set_bytes{namespace!=\"\"}))"
)
)
metrics["pod_mem_top_node"] = _filter_namespace_vector(
_vm_vector(
"topk(5, sum by (node,namespace,pod) (container_memory_working_set_bytes{namespace!=\"\"} * on (namespace,pod) group_left(node) kube_pod_info))"
)
)
metrics["job_failures_24h"] = _vm_vector(
"topk(5, sum by (namespace,job_name) (increase(kube_job_status_failed[24h])))"
)
except Exception as exc:
errors.append(f"vm: {exc}")
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,343 @@
from __future__ import annotations
from .common import *
from .nodes import *
from .k8s import *
from .pods import *
from .vm import *
from .trends import *
from .metrics import *
def _collect_node_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
metrics["postgres_connections"] = _postgres_connections(errors)
metrics["hottest_nodes"] = _hottest_nodes(errors)
metrics["node_usage"] = _node_usage(errors)
metrics["node_usage_stats"] = {
"cpu": _usage_stats(metrics.get("node_usage", {}).get("cpu", [])),
"ram": _usage_stats(metrics.get("node_usage", {}).get("ram", [])),
"net": _usage_stats(metrics.get("node_usage", {}).get("net", [])),
"io": _usage_stats(metrics.get("node_usage", {}).get("io", [])),
"disk": _usage_stats(metrics.get("node_usage", {}).get("disk", [])),
}
try:
node_exprs = _node_usage_exprs()
node_baseline_map: dict[str, dict[str, dict[str, float]]] = {}
for key, expr in node_exprs.items():
baseline = _vm_baseline_map(expr, "node", _BASELINE_WINDOW)
metrics.setdefault("node_baseline", {})[key] = _baseline_map_to_list(baseline, "node")
for name, stats in baseline.items():
node_baseline_map.setdefault(name, {})[key] = stats
metrics["node_baseline_map"] = node_baseline_map
except Exception as exc:
errors.append(f"baseline: {exc}")
def _collect_trend_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
try:
metrics["node_trends"] = _build_metric_trends(
_node_usage_exprs(),
"node",
"node",
_TREND_WINDOWS,
_TREND_NODE_LIMIT,
)
metrics["namespace_trends"] = _build_metric_trends(
_namespace_usage_exprs(),
"namespace",
"namespace",
_TREND_WINDOWS,
_TREND_NAMESPACE_LIMIT,
)
metrics["namespace_request_trends"] = _build_metric_trends(
_namespace_request_exprs(),
"namespace",
"namespace",
_TREND_WINDOWS,
_TREND_NAMESPACE_LIMIT,
)
metrics["restart_trends"] = {
window: _restart_namespace_trend(window) for window in _TREND_WINDOWS
}
metrics["job_failure_trends"] = {
window: _job_failure_trend(window) for window in _TREND_WINDOWS
}
metrics["pods_phase_trends"] = _pods_phase_trends()
metrics["pvc_usage_trends"] = _pvc_usage_trends()
metrics["pod_waiting_now"] = _pod_waiting_now()
metrics["pod_waiting_trends"] = _pod_waiting_trends()
metrics["pod_terminated_now"] = _pod_terminated_now()
metrics["pod_terminated_trends"] = _pod_terminated_trends()
metrics["cluster_trends"] = _cluster_trends()
metrics["node_condition_trends"] = _node_condition_trends()
metrics["pod_reason_totals"] = {
"waiting": _pod_reason_totals(
_POD_WAITING_REASONS,
"kube_pod_container_status_waiting_reason",
),
"terminated": _pod_reason_totals(
_POD_TERMINATED_REASONS,
"kube_pod_container_status_terminated_reason",
),
}
except Exception as exc:
errors.append(f"trends: {exc}")
def _collect_issue_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
try:
waiting_series = "kube_pod_container_status_waiting_reason"
terminated_series = "kube_pod_container_status_terminated_reason"
metrics["namespace_issue_top"] = {
"crash_loop": _namespace_reason_entries(
f'{waiting_series}{{reason="CrashLoopBackOff"}}',
_NAMESPACE_ISSUE_LIMIT,
),
"image_pull": _namespace_reason_entries(
f'{waiting_series}{{reason="ImagePullBackOff"}}',
_NAMESPACE_ISSUE_LIMIT,
),
"err_image_pull": _namespace_reason_entries(
f'{waiting_series}{{reason="ErrImagePull"}}',
_NAMESPACE_ISSUE_LIMIT,
),
"config_error": _namespace_reason_entries(
f'{waiting_series}{{reason="CreateContainerConfigError"}}',
_NAMESPACE_ISSUE_LIMIT,
),
"oom_killed": _namespace_reason_entries(
f'{terminated_series}{{reason="OOMKilled"}}',
_NAMESPACE_ISSUE_LIMIT,
),
"terminated_error": _namespace_reason_entries(
f'{terminated_series}{{reason="Error"}}',
_NAMESPACE_ISSUE_LIMIT,
),
}
except Exception as exc:
errors.append(f"issues: {exc}")
def _collect_alert_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
try:
vm_now = _vm_alerts_now()
vm_trends = {window: _vm_alerts_trend(window) for window in _TREND_WINDOWS}
alertmanager_alerts = _alertmanager_alerts(errors)
metrics["alerts"] = {
"vm": {
"active": vm_now,
"active_total": len(vm_now),
},
"alertmanager": _summarize_alerts(alertmanager_alerts) if alertmanager_alerts else {},
"trends": vm_trends,
}
except Exception as exc:
errors.append(f"alerts: {exc}")
def _collect_namespace_metrics(metrics: dict[str, Any], errors: list[str]) -> None:
try:
metrics["namespace_cpu_top"] = _filter_namespace_vector(
_vm_vector(
f'topk(5, sum by (namespace) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}])))'
)
)
metrics["namespace_mem_top"] = _filter_namespace_vector(
_vm_vector(
"topk(5, sum by (namespace) (container_memory_working_set_bytes{namespace!=\"\"}))"
)
)
metrics["namespace_cpu_requests_top"] = _filter_namespace_vector(
_vm_vector(
"topk(5, sum by (namespace) (kube_pod_container_resource_requests_cpu_cores))"
)
)
metrics["namespace_mem_requests_top"] = _filter_namespace_vector(
_vm_vector(
"topk(5, sum by (namespace) (kube_pod_container_resource_requests_memory_bytes))"
)
)
metrics["namespace_net_top"] = _filter_namespace_vector(
_vm_vector(
f"topk(5, sum by (namespace) (rate(container_network_receive_bytes_total{{namespace!=\"\"}}[{_RATE_WINDOW}]) + rate(container_network_transmit_bytes_total{{namespace!=\"\"}}[{_RATE_WINDOW}])))"
)
)
metrics["namespace_io_top"] = _filter_namespace_vector(
_vm_vector(
f"topk(5, sum by (namespace) (rate(container_fs_reads_bytes_total{{namespace!=\"\"}}[{_RATE_WINDOW}]) + rate(container_fs_writes_bytes_total{{namespace!=\"\"}}[{_RATE_WINDOW}])))"
)
)
namespace_cpu_usage = _vm_namespace_totals(
f'sum by (namespace) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}]))'
)
namespace_cpu_requests = _vm_namespace_totals(
"sum by (namespace) (kube_pod_container_resource_requests_cpu_cores)"
)
namespace_mem_usage = _vm_namespace_totals(
'sum by (namespace) (container_memory_working_set_bytes{namespace!=""})'
)
namespace_mem_requests = _vm_namespace_totals(
"sum by (namespace) (kube_pod_container_resource_requests_memory_bytes)"
)
metrics["namespace_capacity"] = _build_namespace_capacity(
namespace_cpu_usage,
namespace_cpu_requests,
namespace_mem_usage,
namespace_mem_requests,
)
metrics["namespace_totals"] = {
"cpu": _namespace_totals_list(namespace_cpu_usage),
"mem": _namespace_totals_list(namespace_mem_usage),
"cpu_requests": _namespace_totals_list(namespace_cpu_requests),
"mem_requests": _namespace_totals_list(namespace_mem_requests),
}
except Exception as exc:
errors.append(f"namespace_usage: {exc}")
try:
namespace_exprs = _namespace_usage_exprs()
namespace_baseline_map: dict[str, dict[str, dict[str, float]]] = {}
for key, expr in namespace_exprs.items():
baseline = _vm_baseline_map(expr, "namespace", _BASELINE_WINDOW)
metrics.setdefault("namespace_baseline", {})[key] = _baseline_map_to_list(baseline, "namespace")
for name, stats in baseline.items():
namespace_baseline_map.setdefault(name, {})[key] = stats
metrics["namespace_baseline_map"] = namespace_baseline_map
except Exception as exc:
errors.append(f"baseline: {exc}")
metrics["namespace_capacity_summary"] = _namespace_capacity_summary(
metrics.get("namespace_capacity", []),
)
def _finalize_metrics(metrics: dict[str, Any]) -> None:
metrics["units"] = {
"cpu": "percent",
"ram": "percent",
"net": "bytes_per_sec",
"io": "bytes_per_sec",
"disk": "percent",
"restarts": "count",
"pod_cpu": "cores",
"pod_mem": "bytes",
"pod_cpu_top_node": "cores",
"pod_mem_top_node": "bytes",
"job_failures_24h": "count",
"namespace_cpu": "cores",
"namespace_mem": "bytes",
"namespace_cpu_requests": "cores",
"namespace_mem_requests": "bytes",
"namespace_net": "bytes_per_sec",
"namespace_io": "bytes_per_sec",
"pvc_used_percent": "percent",
"capacity_cpu": "cores",
"allocatable_cpu": "cores",
"capacity_mem_bytes": "bytes",
"allocatable_mem_bytes": "bytes",
"capacity_pods": "count",
"allocatable_pods": "count",
}
metrics["windows"] = {
"rates": _RATE_WINDOW,
"restarts": _RESTARTS_WINDOW,
"trend": _TREND_WINDOWS,
}
def _summarize_metrics(errors: list[str]) -> dict[str, Any]:
metrics: dict[str, Any] = {}
_collect_vm_core(metrics, errors)
_collect_node_metrics(metrics, errors)
_collect_trend_metrics(metrics, errors)
_collect_alert_metrics(metrics, errors)
_collect_namespace_metrics(metrics, errors)
_collect_issue_metrics(metrics, errors)
metrics["pvc_usage_top"] = _pvc_usage(errors)
metrics["trend_summary"] = _trend_summary(metrics)
_finalize_metrics(metrics)
return metrics
def _trend_summary(metrics: dict[str, Any]) -> dict[str, Any]:
node_trends = metrics.get("node_trends", {}) if isinstance(metrics.get("node_trends"), dict) else {}
namespace_trends = (
metrics.get("namespace_trends", {}) if isinstance(metrics.get("namespace_trends"), dict) else {}
)
restarts = metrics.get("restart_trends", {}) if isinstance(metrics.get("restart_trends"), dict) else {}
job_failures = (
metrics.get("job_failure_trends", {}) if isinstance(metrics.get("job_failure_trends"), dict) else {}
)
summary: dict[str, Any] = {}
for metric_key, target in (("cpu", "node_cpu"), ("ram", "node_ram")):
metric_block = node_trends.get(metric_key, {}) if isinstance(node_trends.get(metric_key), dict) else {}
summary[target] = {
window: _limit_entries((metric_block.get(window) or {}).get("avg", []), 5)
for window in _TREND_WINDOWS
}
for metric_key, target in (("cpu", "namespace_cpu"), ("mem", "namespace_mem")):
metric_block = namespace_trends.get(metric_key, {}) if isinstance(namespace_trends.get(metric_key), dict) else {}
summary[target] = {
window: _limit_entries((metric_block.get(window) or {}).get("avg", []), 5)
for window in _TREND_WINDOWS
}
summary["restarts"] = {window: _limit_entries(entries or [], 5) for window, entries in restarts.items()}
summary["job_failures"] = {
window: _limit_entries(entries or [], 5) for window, entries in job_failures.items()
}
return summary
def _build_offenders(metrics: dict[str, Any]) -> dict[str, Any]:
offenders: dict[str, Any] = {}
offenders["pod_restarts_1h"] = _pod_restarts_top(metrics)
offenders["pod_waiting_now"] = metrics.get("pod_waiting_now") or {}
offenders["pod_terminated_now"] = metrics.get("pod_terminated_now") or {}
offenders["job_failures_24h"] = metrics.get("job_failures_24h") or []
offenders["pvc_pressure"] = _pvc_pressure_entries(metrics)
offenders["namespace_issues"] = metrics.get("namespace_issue_top") or {}
return offenders
def _namespace_totals_list(totals: dict[str, float]) -> list[dict[str, Any]]:
entries = [
{"namespace": name, "value": value}
for name, value in totals.items()
if isinstance(name, str) and name
]
entries.sort(key=lambda item: (-(item.get("value") or 0), item.get("namespace") or ""))
return entries
def _vector_to_named(entries: list[dict[str, Any]], label_key: str, name_key: str) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in entries:
if not isinstance(item, dict):
continue
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
value = item.get("value")
label = metric.get(label_key) if isinstance(metric, dict) else None
if not isinstance(label, str) or not label:
continue
output.append({name_key: label, "value": value, "metric": metric})
output.sort(key=lambda item: (-(item.get("value") or 0), item.get(name_key) or ""))
return output
def _pvc_top(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in entries:
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
namespace = metric.get("namespace")
pvc = metric.get("persistentvolumeclaim")
if not isinstance(namespace, str) or not isinstance(pvc, str):
continue
output.append(
{
"namespace": namespace,
"pvc": pvc,
"used_percent": item.get("value"),
}
)
output.sort(key=lambda item: (-(item.get("used_percent") or 0), item.get("namespace") or ""))
return output
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,90 @@
from __future__ import annotations
from .common import *
def _node_capacity(raw: Any) -> dict[str, str]:
if not isinstance(raw, dict):
return {}
output: dict[str, str] = {}
for key in _CAPACITY_KEYS:
value = raw.get(key)
if isinstance(value, (str, int, float)) and value != "":
output[key] = str(value)
return output
def _node_pressure_conditions(conditions: Any) -> dict[str, bool]:
if not isinstance(conditions, list):
return {}
pressure: dict[str, bool] = {}
for condition in conditions:
if not isinstance(condition, dict):
continue
cond_type = condition.get("type")
if cond_type in _PRESSURE_TYPES:
pressure[cond_type] = condition.get("status") == "True"
return pressure
def _node_roles(labels: dict[str, Any]) -> list[str]:
roles: list[str] = []
for key in labels.keys():
if key.startswith("node-role.kubernetes.io/"):
role = key.split("/", 1)[-1]
if role:
roles.append(role)
return sorted(set(roles))
def _node_is_worker(labels: dict[str, Any]) -> bool:
if "node-role.kubernetes.io/control-plane" in labels:
return False
if "node-role.kubernetes.io/master" in labels:
return False
if "node-role.kubernetes.io/worker" in labels:
return True
return True
def _hardware_hint(labels: dict[str, Any], node_info: dict[str, Any]) -> str:
result = "unknown"
if str(labels.get("jetson") or "").lower() == "true":
result = "jetson"
else:
hardware = (labels.get("hardware") or "").strip().lower()
if hardware:
result = hardware
else:
kernel = str(node_info.get("kernelVersion") or "").lower()
os_image = str(node_info.get("osImage") or "").lower()
if "tegra" in kernel or "jetson" in os_image:
result = "jetson"
elif "raspi" in kernel or "bcm2711" in kernel:
result = "rpi"
else:
arch = str(node_info.get("architecture") or "").lower()
if arch == "amd64":
result = "amd64"
elif arch == "arm64":
result = "arm64-unknown"
return result
def _condition_status(conditions: Any, cond_type: str) -> tuple[bool | None, str, str]:
if not isinstance(conditions, list):
return None, "", ""
for condition in conditions:
if not isinstance(condition, dict):
continue
if condition.get("type") != cond_type:
continue
status = condition.get("status")
if status == "True":
return True, condition.get("reason") or "", condition.get("message") or ""
if status == "False":
return False, condition.get("reason") or "", condition.get("message") or ""
return None, condition.get("reason") or "", condition.get("message") or ""
return None, "", ""
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,319 @@
from __future__ import annotations
from .common import *
from .nodes import *
from .k8s import *
def _node_pods_top(node_pods: list[dict[str, Any]], limit: int = 5) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for entry in node_pods[:limit]:
if not isinstance(entry, dict):
continue
output.append(
{
"node": entry.get("node"),
"pods_total": entry.get("pods_total"),
"pods_running": entry.get("pods_running"),
"namespaces_top": entry.get("namespaces_top") or [],
}
)
return output
def _record_pending_pod(
pending_oldest: list[dict[str, Any]],
info: dict[str, Any],
) -> bool:
age_hours = info.get("age_hours")
if age_hours is None:
return False
pending_oldest.append(info)
return age_hours >= _PENDING_15M_HOURS
def _update_pod_issue(
pod: dict[str, Any],
acc: dict[str, Any],
) -> None:
metadata = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
status = pod.get("status") if isinstance(pod.get("status"), dict) else {}
spec = pod.get("spec") if isinstance(pod.get("spec"), dict) else {}
namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
created_at = (
metadata.get("creationTimestamp")
if isinstance(metadata.get("creationTimestamp"), str)
else ""
)
age_hours = _age_hours(created_at)
if not name or not namespace:
return
phase = status.get("phase") if isinstance(status.get("phase"), str) else ""
restarts = 0
waiting_reasons: list[str] = []
for container in status.get("containerStatuses") or []:
if not isinstance(container, dict):
continue
restarts += int(container.get("restartCount") or 0)
state = container.get("state") if isinstance(container.get("state"), dict) else {}
waiting = state.get("waiting") if isinstance(state.get("waiting"), dict) else {}
reason = waiting.get("reason")
if isinstance(reason, str) and reason:
waiting_reasons.append(reason)
acc["waiting_reasons"][reason] = acc["waiting_reasons"].get(reason, 0) + 1
phase_reason = status.get("reason")
if isinstance(phase_reason, str) and phase_reason:
acc["phase_reasons"][phase_reason] = acc["phase_reasons"].get(phase_reason, 0) + 1
if phase in acc["counts"]:
acc["counts"][phase] += 1
if phase in _PHASE_SEVERITY or restarts > 0:
acc["items"].append(
{
"namespace": namespace,
"pod": name,
"node": spec.get("nodeName") or "",
"phase": phase,
"reason": status.get("reason") or "",
"restarts": restarts,
"waiting_reasons": sorted(set(waiting_reasons)),
"created_at": created_at,
"age_hours": age_hours,
}
)
if phase == "Pending":
info = {
"namespace": namespace,
"pod": name,
"node": spec.get("nodeName") or "",
"age_hours": age_hours,
"reason": status.get("reason") or "",
}
if _record_pending_pod(acc["pending_oldest"], info):
acc["pending_over_15m"] += 1
def _summarize_pod_issues(payload: dict[str, Any]) -> dict[str, Any]:
acc = {
"items": [],
"counts": {key: 0 for key in _PHASE_SEVERITY},
"pending_oldest": [],
"pending_over_15m": 0,
"waiting_reasons": {},
"phase_reasons": {},
}
for pod in _items(payload):
if isinstance(pod, dict):
_update_pod_issue(pod, acc)
items = acc["items"]
items.sort(
key=lambda item: (
-_PHASE_SEVERITY.get(item.get("phase") or "", 0),
-(item.get("restarts") or 0),
item.get("namespace") or "",
item.get("pod") or "",
)
)
pending_oldest = acc["pending_oldest"]
pending_oldest.sort(key=lambda item: -(item.get("age_hours") or 0.0))
return {
"counts": acc["counts"],
"items": items[:20],
"pending_oldest": pending_oldest[:10],
"pending_over_15m": acc["pending_over_15m"],
"waiting_reasons": acc["waiting_reasons"],
"phase_reasons": acc["phase_reasons"],
}
def _summarize_jobs(payload: dict[str, Any]) -> dict[str, Any]:
totals = {"total": 0, "active": 0, "failed": 0, "succeeded": 0}
by_namespace: dict[str, dict[str, int]] = {}
failing: list[dict[str, Any]] = []
active_oldest: list[dict[str, Any]] = []
for job in _items(payload):
metadata = job.get("metadata") if isinstance(job.get("metadata"), dict) else {}
status = job.get("status") if isinstance(job.get("status"), dict) else {}
name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
created_at = (
metadata.get("creationTimestamp")
if isinstance(metadata.get("creationTimestamp"), str)
else ""
)
if not name or not namespace:
continue
active = int(status.get("active") or 0)
failed = int(status.get("failed") or 0)
succeeded = int(status.get("succeeded") or 0)
totals["total"] += 1
totals["active"] += active
totals["failed"] += failed
totals["succeeded"] += succeeded
entry = by_namespace.setdefault(namespace, {"active": 0, "failed": 0, "succeeded": 0})
entry["active"] += active
entry["failed"] += failed
entry["succeeded"] += succeeded
age_hours = _age_hours(created_at)
if failed > 0:
failing.append(
{
"namespace": namespace,
"job": name,
"failed": failed,
"age_hours": age_hours,
}
)
if active > 0 and age_hours is not None:
active_oldest.append(
{
"namespace": namespace,
"job": name,
"active": active,
"age_hours": age_hours,
}
)
failing.sort(
key=lambda item: (
-(item.get("failed") or 0),
-(item.get("age_hours") or 0.0),
item.get("namespace") or "",
item.get("job") or "",
)
)
active_oldest.sort(key=lambda item: -(item.get("age_hours") or 0.0))
namespace_summary = [
{
"namespace": ns,
"active": stats.get("active", 0),
"failed": stats.get("failed", 0),
"succeeded": stats.get("succeeded", 0),
}
for ns, stats in by_namespace.items()
]
namespace_summary.sort(
key=lambda item: (
-(item.get("active") or 0),
-(item.get("failed") or 0),
item.get("namespace") or "",
)
)
return {
"totals": totals,
"by_namespace": namespace_summary[:20],
"failing": failing[:20],
"active_oldest": active_oldest[:20],
}
def _summarize_deployments(payload: dict[str, Any]) -> dict[str, Any]:
items = _items(payload)
unhealthy: list[dict[str, Any]] = []
for dep in items:
metadata = dep.get("metadata") if isinstance(dep.get("metadata"), dict) else {}
spec = dep.get("spec") if isinstance(dep.get("spec"), dict) else {}
status = dep.get("status") if isinstance(dep.get("status"), dict) else {}
name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
desired = int(spec.get("replicas") or 0)
ready = int(status.get("readyReplicas") or 0)
available = int(status.get("availableReplicas") or 0)
updated = int(status.get("updatedReplicas") or 0)
if desired <= 0:
continue
if ready < desired or available < desired:
unhealthy.append(
{
"name": name,
"namespace": namespace,
"desired": desired,
"ready": ready,
"available": available,
"updated": updated,
}
)
unhealthy.sort(key=lambda item: (item.get("namespace") or "", item.get("name") or ""))
return {
"total": len(items),
"not_ready": len(unhealthy),
"items": unhealthy,
}
def _summarize_statefulsets(payload: dict[str, Any]) -> dict[str, Any]:
items = _items(payload)
unhealthy: list[dict[str, Any]] = []
for st in items:
metadata = st.get("metadata") if isinstance(st.get("metadata"), dict) else {}
spec = st.get("spec") if isinstance(st.get("spec"), dict) else {}
status = st.get("status") if isinstance(st.get("status"), dict) else {}
name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
desired = int(spec.get("replicas") or 0)
ready = int(status.get("readyReplicas") or 0)
current = int(status.get("currentReplicas") or 0)
updated = int(status.get("updatedReplicas") or 0)
if desired <= 0:
continue
if ready < desired:
unhealthy.append(
{
"name": name,
"namespace": namespace,
"desired": desired,
"ready": ready,
"current": current,
"updated": updated,
}
)
unhealthy.sort(key=lambda item: (item.get("namespace") or "", item.get("name") or ""))
return {
"total": len(items),
"not_ready": len(unhealthy),
"items": unhealthy,
}
def _summarize_daemonsets(payload: dict[str, Any]) -> dict[str, Any]:
items = _items(payload)
unhealthy: list[dict[str, Any]] = []
for ds in items:
metadata = ds.get("metadata") if isinstance(ds.get("metadata"), dict) else {}
status = ds.get("status") if isinstance(ds.get("status"), dict) else {}
name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
namespace = metadata.get("namespace") if isinstance(metadata.get("namespace"), str) else ""
desired = int(status.get("desiredNumberScheduled") or 0)
ready = int(status.get("numberReady") or 0)
updated = int(status.get("updatedNumberScheduled") or 0)
if desired <= 0:
continue
if ready < desired:
unhealthy.append(
{
"name": name,
"namespace": namespace,
"desired": desired,
"ready": ready,
"updated": updated,
}
)
unhealthy.sort(key=lambda item: (item.get("namespace") or "", item.get("name") or ""))
return {
"total": len(items),
"not_ready": len(unhealthy),
"items": unhealthy,
}
def _summarize_workload_health(
deployments: dict[str, Any],
statefulsets: dict[str, Any],
daemonsets: dict[str, Any],
) -> dict[str, Any]:
return {
"deployments": deployments,
"statefulsets": statefulsets,
"daemonsets": daemonsets,
}
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,391 @@
from __future__ import annotations
from .common import *
from .nodes import *
from .k8s import *
from .pods import *
from .vm import *
from .trends import *
from .metrics import *
from .metrics_extra import *
from .context import *
def _node_context(
node_details: list[dict[str, Any]],
node_load: list[dict[str, Any]],
node_baseline: dict[str, dict[str, dict[str, float]]],
node_workloads: dict[str, dict[str, int]],
) -> list[dict[str, Any]]:
load_map = {entry.get("node"): entry for entry in node_load if isinstance(entry, dict)}
output: list[dict[str, Any]] = []
for entry in node_details:
if not isinstance(entry, dict):
continue
name = entry.get("name")
if not isinstance(name, str) or not name:
continue
load_entry = load_map.get(name, {})
baseline = node_baseline.get(name, {}) if isinstance(node_baseline, dict) else {}
deltas: dict[str, float] = {}
for key in ("cpu", "ram", "net", "io", "disk"):
current = load_entry.get(key)
stats = baseline.get(key, {}) if isinstance(baseline, dict) else {}
delta = _baseline_delta(current, stats)
if delta is not None:
deltas[key] = delta
workloads = node_workloads.get(name, {}) if isinstance(node_workloads, dict) else {}
workloads_top = sorted(workloads.items(), key=lambda item: (-item[1], item[0]))[:_NODE_WORKLOAD_TOP]
output.append(
{
"node": name,
"ready": entry.get("ready"),
"roles": entry.get("roles"),
"is_worker": entry.get("is_worker"),
"hardware": entry.get("hardware"),
"arch": entry.get("arch"),
"os": entry.get("os"),
"taints": entry.get("taints"),
"unschedulable": entry.get("unschedulable"),
"pressure_flags": entry.get("pressure"),
"pods_total": load_entry.get("pods_total"),
"cpu": load_entry.get("cpu"),
"ram": load_entry.get("ram"),
"disk": load_entry.get("disk"),
"net": load_entry.get("net"),
"io": load_entry.get("io"),
"load_index": load_entry.get("load_index"),
"baseline": baseline,
"baseline_delta": deltas,
"workloads_top": workloads_top,
}
)
output.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("node") or ""))
return output
def _baseline_delta(current: Any, stats: dict[str, Any]) -> float | None:
if not isinstance(current, (int, float)):
return None
avg = stats.get("avg")
if not isinstance(avg, (int, float)) or avg == 0:
return None
return round(((float(current) - float(avg)) / float(avg)) * 100, 2)
def _delta_severity(delta: float) -> str:
magnitude = abs(delta)
if magnitude >= _BASELINE_DELTA_CRIT:
return "critical"
if magnitude >= _BASELINE_DELTA_WARN:
return "warning"
return "info"
def _delta_entry_label(entry: dict[str, Any]) -> tuple[str, str]:
if "node" in entry:
return ("node", str(entry.get("node") or ""))
return ("namespace", str(entry.get("namespace") or ""))
def _delta_top(entries: list[dict[str, Any]], key: str, limit: int = _DELTA_TOP_LIMIT) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for entry in entries:
if not isinstance(entry, dict):
continue
deltas = entry.get("baseline_delta") if isinstance(entry.get("baseline_delta"), dict) else {}
delta = deltas.get(key)
if not isinstance(delta, (int, float)):
continue
label_key, label_value = _delta_entry_label(entry)
output.append(
{
label_key: label_value,
"metric": key,
"delta": delta,
"severity": _delta_severity(float(delta)),
}
)
output.sort(key=lambda item: (-(abs(item.get("delta") or 0)), item.get("metric") or ""))
return output[:limit]
def _reason_top(counts: dict[str, Any], limit: int = _REASON_TOP_LIMIT) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for reason, value in counts.items() if isinstance(counts, dict) else []:
if isinstance(reason, str) and reason and isinstance(value, (int, float)):
output.append({"reason": reason, "count": int(value)})
output.sort(key=lambda item: (-item.get("count", 0), item.get("reason") or ""))
return output[:limit]
def _pod_issue_summary(pod_issues: dict[str, Any], metrics: dict[str, Any]) -> dict[str, Any]:
waiting = pod_issues.get("waiting_reasons") if isinstance(pod_issues, dict) else {}
phase = pod_issues.get("phase_reasons") if isinstance(pod_issues, dict) else {}
return {
"waiting_reasons_top": _reason_top(waiting),
"phase_reasons_top": _reason_top(phase),
"namespace_issue_top": metrics.get("namespace_issue_top") or {},
}
def _delta_hit(delta: Any) -> bool:
if not isinstance(delta, (int, float)):
return False
return abs(float(delta)) >= _BASELINE_DELTA_WARN
def _node_delta_signals(node_context: list[dict[str, Any]]) -> list[dict[str, Any]]:
signals: list[dict[str, Any]] = []
for entry in node_context:
if not isinstance(entry, dict):
continue
node = entry.get("node")
deltas = entry.get("baseline_delta") if isinstance(entry.get("baseline_delta"), dict) else {}
baseline = entry.get("baseline") if isinstance(entry.get("baseline"), dict) else {}
if not isinstance(node, str) or not node:
continue
for metric in ("cpu", "ram", "net", "io", "disk"):
delta = deltas.get(metric)
if not _delta_hit(delta):
continue
avg = baseline.get(metric, {}).get("avg") if isinstance(baseline.get(metric), dict) else None
signals.append(
{
"scope": "node",
"target": node,
"metric": metric,
"current": entry.get(metric),
"baseline_avg": avg,
"delta_pct": delta,
"severity": _delta_severity(float(delta)),
}
)
return signals
def _namespace_delta_signals(namespace_context: list[dict[str, Any]]) -> list[dict[str, Any]]:
signals: list[dict[str, Any]] = []
for entry in namespace_context:
if not isinstance(entry, dict):
continue
namespace = entry.get("namespace")
deltas = entry.get("baseline_delta") if isinstance(entry.get("baseline_delta"), dict) else {}
baseline = entry.get("baseline") if isinstance(entry.get("baseline"), dict) else {}
if not isinstance(namespace, str) or not namespace:
continue
for metric, current_key in (("cpu", "cpu_usage"), ("mem", "mem_usage")):
delta = deltas.get(metric)
if not _delta_hit(delta):
continue
avg = baseline.get(metric, {}).get("avg") if isinstance(baseline.get(metric), dict) else None
signals.append(
{
"scope": "namespace",
"target": namespace,
"metric": metric,
"current": entry.get(current_key),
"baseline_avg": avg,
"delta_pct": delta,
"severity": _delta_severity(float(delta)),
}
)
return signals
def _kustomization_signals(kustomizations: dict[str, Any]) -> list[dict[str, Any]]:
count = int(kustomizations.get("not_ready") or 0) if isinstance(kustomizations, dict) else 0
if count <= 0:
return []
return [
{
"scope": "flux",
"target": "kustomizations",
"metric": "not_ready",
"current": count,
"severity": "warning",
}
]
def _pod_issue_signals(pod_issues: dict[str, Any]) -> list[dict[str, Any]]:
if not isinstance(pod_issues, dict):
return []
signals: list[dict[str, Any]] = []
pending_over = int(pod_issues.get("pending_over_15m") or 0)
if pending_over > 0:
signals.append(
{
"scope": "pods",
"target": "pending_over_15m",
"metric": "count",
"current": pending_over,
"severity": "warning",
}
)
counts = pod_issues.get("counts") if isinstance(pod_issues.get("counts"), dict) else {}
failed = int(counts.get("Failed") or 0) if isinstance(counts, dict) else 0
if failed > 0:
signals.append(
{
"scope": "pods",
"target": "failed",
"metric": "count",
"current": failed,
"severity": "critical",
}
)
return signals
def _workload_health_signals(workloads_health: dict[str, Any]) -> list[dict[str, Any]]:
not_ready = _workload_not_ready_items(workloads_health)
if not not_ready:
return []
output: list[dict[str, Any]] = []
for entry in not_ready[:5]:
output.append(
{
"scope": "workload",
"target": f"{entry.get('namespace')}/{entry.get('workload')}",
"metric": "not_ready",
"current": entry.get("ready") or 0,
"desired": entry.get("desired") or 0,
"severity": "warning",
}
)
return output
def _severity_rank(value: Any) -> int:
if value == "critical":
return 0
if value == "warning":
return 1
return 2
def _pvc_pressure_signals(metrics: dict[str, Any]) -> list[dict[str, Any]]:
pvc_top = _pvc_top(metrics.get("pvc_usage_top", []))
if not pvc_top:
return []
output: list[dict[str, Any]] = []
for entry in pvc_top:
used = entry.get("used_percent")
if not isinstance(used, (int, float)) or used < _PVC_PRESSURE_THRESHOLD:
continue
output.append(
{
"scope": "pvc",
"target": f"{entry.get('namespace')}/{entry.get('pvc')}",
"metric": "used_percent",
"current": used,
"severity": "warning" if used < _PVC_CRITICAL_THRESHOLD else "critical",
}
)
return output
def _build_signals(context: SignalContext) -> list[dict[str, Any]]:
signals = (
_node_delta_signals(context.node_context)
+ _namespace_delta_signals(context.namespace_context)
+ _workload_health_signals(context.workloads_health)
+ _pod_issue_signals(context.pod_issues)
+ _kustomization_signals(context.kustomizations)
+ _pvc_pressure_signals(context.metrics)
)
signals.sort(key=lambda item: (_severity_rank(item.get("severity")), item.get("scope") or ""))
return signals[:_SIGNAL_LIMIT]
def _node_profiles(
node_context: list[dict[str, Any]],
node_pods: list[dict[str, Any]],
node_workloads: dict[str, dict[str, int]],
) -> list[dict[str, Any]]:
pod_map = {entry.get("node"): entry for entry in node_pods if isinstance(entry, dict)}
workload_map = node_workloads or {}
profiles: list[dict[str, Any]] = []
for entry in node_context:
if not isinstance(entry, dict):
continue
node = entry.get("node")
if not isinstance(node, str) or not node:
continue
pods = pod_map.get(node, {})
workloads = workload_map.get(node, {})
workloads_top = sorted(workloads.items(), key=lambda item: (-item[1], item[0]))[:_NODE_WORKLOAD_TOP]
profiles.append(
{
"node": node,
"ready": entry.get("ready"),
"hardware": entry.get("hardware"),
"arch": entry.get("arch"),
"roles": entry.get("roles"),
"pods_total": pods.get("pods_total"),
"pods_running": pods.get("pods_running"),
"namespaces_top": pods.get("namespaces_top") or [],
"workloads_top": workloads_top,
"load_index": entry.get("load_index"),
"cpu": entry.get("cpu"),
"ram": entry.get("ram"),
"net": entry.get("net"),
"io": entry.get("io"),
"disk": entry.get("disk"),
"baseline_delta": entry.get("baseline_delta") or {},
}
)
profiles.sort(key=lambda item: (-(item.get("load_index") or 0), item.get("node") or ""))
return profiles[:_PROFILE_LIMIT]
def _namespace_profiles(namespace_context: list[dict[str, Any]]) -> list[dict[str, Any]]:
entries = [entry for entry in namespace_context if isinstance(entry, dict)]
entries.sort(key=lambda item: (-(item.get("pods_total") or 0), item.get("namespace") or ""))
output: list[dict[str, Any]] = []
for entry in entries[:_PROFILE_LIMIT]:
output.append(
{
"namespace": entry.get("namespace"),
"pods_total": entry.get("pods_total"),
"pods_running": entry.get("pods_running"),
"primary_node": entry.get("primary_node"),
"nodes_top": entry.get("nodes_top") or [],
"cpu_usage": entry.get("cpu_usage"),
"mem_usage": entry.get("mem_usage"),
"cpu_ratio": entry.get("cpu_ratio"),
"mem_ratio": entry.get("mem_ratio"),
"baseline_delta": entry.get("baseline_delta") or {},
}
)
return output
def _workload_profiles(workloads: list[dict[str, Any]]) -> list[dict[str, Any]]:
entries = [entry for entry in workloads if isinstance(entry, dict)]
entries.sort(
key=lambda item: (-(item.get("pods_total") or 0), item.get("namespace") or "", item.get("workload") or ""),
)
output: list[dict[str, Any]] = []
for entry in entries[:_PROFILE_LIMIT]:
nodes = entry.get("nodes") if isinstance(entry.get("nodes"), dict) else {}
nodes_top = (
sorted(nodes.items(), key=lambda item: (-item[1], item[0]))[:3]
if isinstance(nodes, dict)
else []
)
output.append(
{
"namespace": entry.get("namespace"),
"workload": entry.get("workload"),
"source": entry.get("source"),
"pods_total": entry.get("pods_total"),
"pods_running": entry.get("pods_running"),
"primary_node": entry.get("primary_node"),
"nodes_top": nodes_top,
}
)
return output
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,329 @@
from __future__ import annotations
from .common import *
from .nodes import *
from .k8s import *
from .pods import *
from .vm import *
def _vm_node_metric(expr: str, label_key: str) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in _vm_vector(expr):
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
label = metric.get(label_key)
value = item.get("value")
if isinstance(label, str) and label:
output.append({"node": label, "value": value})
output.sort(key=lambda item: item.get("node") or "")
return output
def _vm_baseline_map(expr: str, label_key: str, window: str) -> dict[str, dict[str, float]]:
averages = _vm_vector(f"avg_over_time(({expr})[{window}])")
maximums = _vm_vector(f"max_over_time(({expr})[{window}])")
baseline: dict[str, dict[str, float]] = {}
for item in averages:
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
label = metric.get(label_key)
if not isinstance(label, str) or not label:
continue
baseline.setdefault(label, {})["avg"] = float(item.get("value") or 0)
for item in maximums:
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
label = metric.get(label_key)
if not isinstance(label, str) or not label:
continue
baseline.setdefault(label, {})["max"] = float(item.get("value") or 0)
return baseline
def _baseline_map_to_list(
baseline: dict[str, dict[str, float]],
name_key: str,
) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for name, stats in baseline.items():
if not isinstance(name, str) or not name:
continue
output.append(
{
name_key: name,
"avg": stats.get("avg"),
"max": stats.get("max"),
}
)
output.sort(key=lambda item: (-(item.get("avg") or 0), item.get(name_key) or ""))
return output
def _limit_entries(entries: list[dict[str, Any]], limit: int) -> list[dict[str, Any]]:
if limit <= 0:
return []
return entries[:limit]
def _vm_window_series(
expr: str,
label_key: str,
name_key: str,
window: str,
) -> dict[str, list[dict[str, Any]]]:
avg = _vector_to_named(
_vm_vector(f"avg_over_time(({expr})[{window}])"),
label_key,
name_key,
)
max_values = _vector_to_named(
_vm_vector(f"max_over_time(({expr})[{window}])"),
label_key,
name_key,
)
p95 = _vector_to_named(
_vm_vector(f"quantile_over_time(0.95, ({expr})[{window}])"),
label_key,
name_key,
)
return {"avg": avg, "max": max_values, "p95": p95}
def _trim_window_series(series: dict[str, list[dict[str, Any]]], limit: int) -> dict[str, list[dict[str, Any]]]:
return {key: _limit_entries(entries, limit) for key, entries in series.items()}
def _build_metric_trends(
exprs: dict[str, str],
label_key: str,
name_key: str,
windows: tuple[str, ...],
limit: int,
) -> dict[str, dict[str, dict[str, list[dict[str, Any]]]]]:
trends: dict[str, dict[str, dict[str, list[dict[str, Any]]]]] = {}
for metric, expr in exprs.items():
metric_trends: dict[str, dict[str, list[dict[str, Any]]]] = {}
for window in windows:
series = _vm_window_series(expr, label_key, name_key, window)
metric_trends[window] = _trim_window_series(series, limit)
trends[metric] = metric_trends
return trends
def _vm_scalar_window(expr: str, window: str, fn: str) -> float | None:
return _vm_scalar(f"{fn}(({expr})[{window}])")
def _scalar_trends(expr: str, windows: tuple[str, ...]) -> dict[str, dict[str, float | None]]:
return {
window: {
"avg": _vm_scalar_window(expr, window, "avg_over_time"),
"min": _vm_scalar_window(expr, window, "min_over_time"),
"max": _vm_scalar_window(expr, window, "max_over_time"),
}
for window in windows
}
def _cluster_trends() -> dict[str, dict[str, dict[str, float | None]]]:
exprs = {
"nodes_ready": 'sum(kube_node_status_condition{condition="Ready",status="true"})',
"nodes_not_ready": 'sum(kube_node_status_condition{condition="Ready",status="false"})',
"pods_running": 'sum(kube_pod_status_phase{phase="Running"})',
"pods_pending": 'sum(kube_pod_status_phase{phase="Pending"})',
"pods_failed": 'sum(kube_pod_status_phase{phase="Failed"})',
"pods_succeeded": 'sum(kube_pod_status_phase{phase="Succeeded"})',
"alerts_firing": 'sum(ALERTS{alertstate="firing"})',
"cpu_usage": f'sum(rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}]))',
"mem_usage": 'sum(container_memory_working_set_bytes{namespace!=""})',
"net_io": (
f'sum(rate(container_network_receive_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]) '
f'+ rate(container_network_transmit_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]))'
),
"fs_io": (
f'sum(rate(container_fs_reads_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]) '
f'+ rate(container_fs_writes_bytes_total{{namespace!=""}}[{_RATE_WINDOW}]))'
),
}
return {key: _scalar_trends(expr, _TREND_WINDOWS) for key, expr in exprs.items()}
def _node_condition_trends() -> dict[str, dict[str, dict[str, float | None]]]:
conditions = {
"ready": 'sum(kube_node_status_condition{condition="Ready",status="true"})',
"not_ready": 'sum(kube_node_status_condition{condition="Ready",status="false"})',
"unschedulable": "sum(kube_node_spec_unschedulable)",
}
for cond in _PRESSURE_TYPES:
conditions[cond.lower()] = (
f'sum(kube_node_status_condition{{condition="{cond}",status="true"}})'
)
return {key: _scalar_trends(expr, _TREND_WINDOWS) for key, expr in conditions.items()}
def _pod_reason_totals(
reasons: dict[str, str],
series: str,
) -> dict[str, dict[str, dict[str, float | None]]]:
totals: dict[str, dict[str, dict[str, float | None]]] = {}
for key, reason in reasons.items():
expr = f'sum({series}{{reason="{reason}"}})'
totals[key] = _scalar_trends(expr, _TREND_WINDOWS)
return totals
def _node_usage_exprs() -> dict[str, str]:
return {
"cpu": (
f'avg by (node) (((1 - avg by (instance) (rate(node_cpu_seconds_total{{mode="idle"}}[{_RATE_WINDOW}]))) * 100) '
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))'
),
"ram": (
'avg by (node) ((avg by (instance) ((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) '
'/ node_memory_MemTotal_bytes * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))'
),
"net": (
f'avg by (node) ((sum by (instance) (rate(node_network_receive_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]) '
f'+ rate(node_network_transmit_bytes_total{{device!~"lo"}}[{_RATE_WINDOW}]))) * on(instance) group_left(node) '
'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))'
),
"io": (
f'avg by (node) ((sum by (instance) (rate(node_disk_read_bytes_total[{_RATE_WINDOW}]) + rate(node_disk_written_bytes_total[{_RATE_WINDOW}]))) '
'* on(instance) group_left(node) label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))'
),
"disk": (
'avg by (node) (((1 - avg by (instance) (node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|overlay"} '
'/ node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"})) * 100) * on(instance) group_left(node) '
'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)"))'
),
}
def _namespace_usage_exprs() -> dict[str, str]:
return {
"cpu": f'sum by (namespace) (rate(container_cpu_usage_seconds_total{{namespace!=""}}[{_RATE_WINDOW}]))',
"mem": 'sum by (namespace) (container_memory_working_set_bytes{namespace!=""})',
}
def _namespace_request_exprs() -> dict[str, str]:
return {
"cpu_requests": "sum by (namespace) (kube_pod_container_resource_requests_cpu_cores)",
"mem_requests": "sum by (namespace) (kube_pod_container_resource_requests_memory_bytes)",
}
def _restart_namespace_trend(window: str) -> list[dict[str, Any]]:
entries = _vm_vector(
f"topk({_TREND_NAMESPACE_LIMIT}, sum by (namespace) (increase(kube_pod_container_status_restarts_total[{window}])))"
)
entries = _filter_namespace_vector(entries)
return _vector_to_named(entries, "namespace", "namespace")
def _job_failure_trend(window: str) -> list[dict[str, Any]]:
entries = _vm_vector(
f"topk({_TREND_JOB_LIMIT}, sum by (namespace,job_name) (increase(kube_job_status_failed[{window}])))"
)
output: list[dict[str, Any]] = []
for item in entries:
if not isinstance(item, dict):
continue
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
namespace = metric.get("namespace")
job = metric.get("job_name")
if not isinstance(namespace, str) or not isinstance(job, str):
continue
output.append(
{
"namespace": namespace,
"job": job,
"value": item.get("value"),
}
)
output.sort(key=lambda item: (-(item.get("value") or 0), item.get("namespace") or "", item.get("job") or ""))
return output
def _pod_reason_entries(expr: str, limit: int) -> list[dict[str, Any]]:
entries = _vm_vector(f"topk({limit}, sum by (namespace,pod) ({expr}))")
output: list[dict[str, Any]] = []
for item in entries:
if not isinstance(item, dict):
continue
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
namespace = metric.get("namespace")
pod = metric.get("pod")
if not isinstance(namespace, str) or not isinstance(pod, str):
continue
output.append(
{
"namespace": namespace,
"pod": pod,
"value": item.get("value"),
}
)
output.sort(key=lambda item: (-(item.get("value") or 0), item.get("namespace") or "", item.get("pod") or ""))
return output
def _namespace_reason_entries(expr: str, limit: int) -> list[dict[str, Any]]:
entries = _vm_vector(f"topk({limit}, sum by (namespace) ({expr}))")
entries = _filter_namespace_vector(entries)
return _vector_to_named(entries, "namespace", "namespace")
def _pod_waiting_now() -> dict[str, list[dict[str, Any]]]:
output: dict[str, list[dict[str, Any]]] = {}
for key, reason in _POD_WAITING_REASONS.items():
expr = f'kube_pod_container_status_waiting_reason{{reason="{reason}"}}'
output[key] = _pod_reason_entries(expr, _POD_REASON_LIMIT)
return output
def _pod_waiting_trends() -> dict[str, dict[str, list[dict[str, Any]]]]:
trends: dict[str, dict[str, list[dict[str, Any]]]] = {}
for key, reason in _POD_WAITING_REASONS.items():
expr = f'kube_pod_container_status_waiting_reason{{reason="{reason}"}}'
trends[key] = {
window: _pod_reason_entries(f"max_over_time(({expr})[{window}])", _POD_REASON_TREND_LIMIT)
for window in _TREND_WINDOWS
}
return trends
def _pod_terminated_now() -> dict[str, list[dict[str, Any]]]:
output: dict[str, list[dict[str, Any]]] = {}
for key, reason in _POD_TERMINATED_REASONS.items():
expr = f'kube_pod_container_status_terminated_reason{{reason="{reason}"}}'
output[key] = _pod_reason_entries(expr, _POD_REASON_LIMIT)
return output
def _pod_terminated_trends() -> dict[str, dict[str, list[dict[str, Any]]]]:
trends: dict[str, dict[str, list[dict[str, Any]]]] = {}
for key, reason in _POD_TERMINATED_REASONS.items():
expr = f'kube_pod_container_status_terminated_reason{{reason="{reason}"}}'
trends[key] = {
window: _pod_reason_entries(f"max_over_time(({expr})[{window}])", _POD_REASON_TREND_LIMIT)
for window in _TREND_WINDOWS
}
return trends
def _pods_phase_trends() -> dict[str, dict[str, dict[str, float | None]]]:
phases = {
"running": "sum(kube_pod_status_phase{phase=\"Running\"})",
"pending": "sum(kube_pod_status_phase{phase=\"Pending\"})",
"failed": "sum(kube_pod_status_phase{phase=\"Failed\"})",
}
trends: dict[str, dict[str, dict[str, float | None]]] = {}
for window in _TREND_WINDOWS:
window_entry: dict[str, dict[str, float | None]] = {}
for name, expr in phases.items():
window_entry[name] = {
"avg": _vm_scalar_window(expr, window, "avg_over_time"),
"max": _vm_scalar_window(expr, window, "max_over_time"),
}
trends[window] = window_entry
return trends
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -0,0 +1,292 @@
from __future__ import annotations
from .common import *
from .nodes import *
from .k8s import *
from .pods import *
def _fetch_nodes(errors: list[str]) -> tuple[dict[str, Any], list[dict[str, Any]], dict[str, Any]]:
nodes: dict[str, Any] = {}
details: list[dict[str, Any]] = []
summary: dict[str, Any] = {}
try:
payload = get_json("/api/v1/nodes")
nodes = _summarize_nodes(payload)
details = _node_details(payload)
summary = _summarize_inventory(details)
except Exception as exc:
errors.append(f"nodes: {exc}")
return nodes, details, summary
def _fetch_flux(errors: list[str]) -> dict[str, Any]:
try:
payload = get_json(
"/apis/kustomize.toolkit.fluxcd.io/v1/namespaces/flux-system/kustomizations"
)
return _summarize_kustomizations(payload)
except Exception as exc:
errors.append(f"flux: {exc}")
return {}
def _fetch_pods(
errors: list[str],
) -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]], dict[str, Any]]:
workloads: list[dict[str, Any]] = []
namespace_pods: list[dict[str, Any]] = []
namespace_nodes: list[dict[str, Any]] = []
node_pods: list[dict[str, Any]] = []
pod_issues: dict[str, Any] = {}
try:
pods_payload = get_json("/api/v1/pods?limit=5000")
workloads = _summarize_workloads(pods_payload)
namespace_pods = _summarize_namespace_pods(pods_payload)
namespace_nodes = _summarize_namespace_nodes(pods_payload)
node_pods = _summarize_node_pods(pods_payload)
pod_issues = _summarize_pod_issues(pods_payload)
except Exception as exc:
errors.append(f"pods: {exc}")
return workloads, namespace_pods, namespace_nodes, node_pods, pod_issues
def _fetch_jobs(errors: list[str]) -> dict[str, Any]:
try:
jobs_payload = get_json("/apis/batch/v1/jobs?limit=2000")
return _summarize_jobs(jobs_payload)
except Exception as exc:
errors.append(f"jobs: {exc}")
return {}
def _summarize_longhorn_volumes(payload: dict[str, Any]) -> dict[str, Any]:
items = _items(payload)
if not items:
return {}
by_state: dict[str, int] = {}
by_robustness: dict[str, int] = {}
degraded: list[dict[str, Any]] = []
attached_count = 0
detached_count = 0
degraded_count = 0
for volume in items:
metadata = volume.get("metadata") if isinstance(volume.get("metadata"), dict) else {}
status = volume.get("status") if isinstance(volume.get("status"), dict) else {}
spec = volume.get("spec") if isinstance(volume.get("spec"), dict) else {}
name = metadata.get("name") if isinstance(metadata.get("name"), str) else ""
if not name:
continue
state = status.get("state") if isinstance(status.get("state"), str) else "unknown"
robustness = (
status.get("robustness") if isinstance(status.get("robustness"), str) else "unknown"
)
state_lower = state.lower()
robustness_lower = robustness.lower()
by_state[state] = by_state.get(state, 0) + 1
by_robustness[robustness] = by_robustness.get(robustness, 0) + 1
if state_lower == "attached":
attached_count += 1
elif state_lower == "detached":
detached_count += 1
if robustness_lower in {"degraded", "faulted"}:
degraded_count += 1
degraded.append(
{
"name": name,
"state": state,
"robustness": robustness,
"size": spec.get("size"),
"actual_size": status.get("actualSize"),
}
)
degraded.sort(key=lambda item: item.get("name") or "")
return {
"total": len(items),
"by_state": by_state,
"by_robustness": by_robustness,
"attached_count": attached_count,
"detached_count": detached_count,
"degraded": degraded,
"degraded_count": degraded_count,
}
def _fetch_longhorn(errors: list[str]) -> dict[str, Any]:
try:
payload = get_json(
"/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes"
)
return _summarize_longhorn_volumes(payload)
except Exception as exc:
errors.append(f"longhorn: {exc}")
return {}
def _fetch_workload_health(errors: list[str]) -> dict[str, Any]:
try:
deployments_payload = get_json("/apis/apps/v1/deployments?limit=2000")
statefulsets_payload = get_json("/apis/apps/v1/statefulsets?limit=2000")
daemonsets_payload = get_json("/apis/apps/v1/daemonsets?limit=2000")
deployments = _summarize_deployments(deployments_payload)
statefulsets = _summarize_statefulsets(statefulsets_payload)
daemonsets = _summarize_daemonsets(daemonsets_payload)
return _summarize_workload_health(deployments, statefulsets, daemonsets)
except Exception as exc:
errors.append(f"workloads_health: {exc}")
return {}
def _fetch_events(errors: list[str]) -> dict[str, Any]:
try:
events_payload = get_json("/api/v1/events?limit=2000")
return _summarize_events(events_payload)
except Exception as exc:
errors.append(f"events: {exc}")
return {}
def _vm_query(expr: str) -> list[dict[str, Any]] | None:
base = settings.vm_url
if not base:
return None
url = f"{base.rstrip('/')}/api/v1/query"
params = {"query": expr}
with httpx.Client(timeout=settings.cluster_state_vm_timeout_sec) as client:
resp = client.get(url, params=params)
resp.raise_for_status()
payload = resp.json()
if payload.get("status") != "success":
return None
data = payload.get("data") if isinstance(payload.get("data"), dict) else {}
result = data.get("result")
return result if isinstance(result, list) else None
def _vm_scalar(expr: str) -> float | None:
result = _vm_query(expr)
if not result:
return None
value = result[0].get("value") if isinstance(result[0], dict) else None
if not isinstance(value, list) or len(value) < _VALUE_PAIR_LEN:
return None
try:
return float(value[1])
except (TypeError, ValueError):
return None
def _vm_vector(expr: str) -> list[dict[str, Any]]:
result = _vm_query(expr) or []
output: list[dict[str, Any]] = []
for item in result:
if not isinstance(item, dict):
continue
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
value = item.get("value") if isinstance(item.get("value"), list) else []
if len(value) < _VALUE_PAIR_LEN:
continue
try:
numeric = float(value[1])
except (TypeError, ValueError):
continue
output.append({"metric": metric, "value": numeric})
return output
def _alert_entries(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in entries:
if not isinstance(item, dict):
continue
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
value = item.get("value")
name = metric.get("alertname")
if not isinstance(name, str) or not name:
continue
severity = metric.get("severity") if isinstance(metric.get("severity"), str) else ""
output.append(
{
"alert": name,
"severity": severity,
"value": value,
}
)
output.sort(key=lambda item: (-(item.get("value") or 0), item.get("alert") or ""))
return output
def _vm_alerts_now() -> list[dict[str, Any]]:
entries = _vm_vector('sum by (alertname,severity) (ALERTS{alertstate="firing"})')
return _alert_entries(entries)[:_ALERT_TOP_LIMIT]
def _vm_alerts_trend(window: str) -> list[dict[str, Any]]:
entries = _vm_vector(
f"topk({_ALERT_TOP_LIMIT}, sum by (alertname,severity) (count_over_time(ALERTS{{alertstate=\"firing\"}}[{window}])))"
)
return _alert_entries(entries)
def _alertmanager_alerts(errors: list[str]) -> list[dict[str, Any]]:
base = settings.alertmanager_url
if not base:
return []
url = f"{base.rstrip('/')}/api/v2/alerts"
try:
with httpx.Client(timeout=settings.cluster_state_vm_timeout_sec) as client:
resp = client.get(url)
resp.raise_for_status()
payload = resp.json()
if isinstance(payload, list):
return [item for item in payload if isinstance(item, dict)]
except Exception as exc:
errors.append(f"alertmanager: {exc}")
return []
def _summarize_alerts(alerts: list[dict[str, Any]]) -> dict[str, Any]:
items: list[dict[str, Any]] = []
by_severity: dict[str, int] = {}
for alert in alerts:
labels = alert.get("labels") if isinstance(alert.get("labels"), dict) else {}
alertname = labels.get("alertname")
if not isinstance(alertname, str) or not alertname:
continue
severity = labels.get("severity") if isinstance(labels.get("severity"), str) else ""
items.append({"alert": alertname, "severity": severity})
if severity:
by_severity[severity] = by_severity.get(severity, 0) + 1
items.sort(key=lambda item: (item.get("severity") or "", item.get("alert") or ""))
return {
"total": len(items),
"by_severity": by_severity,
"items": items[:_ALERT_TOP_LIMIT],
}
def _filter_namespace_vector(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
output: list[dict[str, Any]] = []
for item in entries:
if not isinstance(item, dict):
continue
metric = item.get("metric") if isinstance(item.get("metric"), dict) else {}
namespace = metric.get("namespace")
if not isinstance(namespace, str) or not namespace:
continue
if namespace in _SYSTEM_NAMESPACES:
continue
output.append(item)
return output
def _vm_topk(expr: str, label_key: str) -> dict[str, Any] | None:
result = _vm_vector(expr)
if not result:
return None
metric = result[0].get("metric") if isinstance(result[0], dict) else {}
value = result[0].get("value")
label = metric.get(label_key) if isinstance(metric, dict) else None
return {"label": label or "", "value": value, "metric": metric}
__all__ = [name for name in globals() if not name.startswith("__")]

View File

@ -1,93 +1,47 @@
"""Compatibility facade for Ariadne comms workflows."""
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass
import base64
import time
import urllib.parse
from typing import Any from typing import Any
import httpx import httpx
import psycopg import psycopg
from ..settings import settings from ..settings import settings
from ..utils.logging import get_logger
from ..utils.name_generator import NameGenerator from ..utils.name_generator import NameGenerator
from .comms_guest import CommsGuestService
from .comms_room import CommsRoomService
from .comms_support import (
CommsSummary,
DisplayNameTarget,
MasGuestResult,
SynapseGuestResult,
SynapseUserRef,
_auth,
_canon_user,
_needs_rename_display,
_needs_rename_username,
)
__all__ = [
logger = get_logger(__name__) "CommsService",
"CommsSummary",
HTTP_OK = 200 "DisplayNameTarget",
HTTP_CREATED = 201 "MasGuestResult",
HTTP_ACCEPTED = 202 "SynapseGuestResult",
HTTP_NO_CONTENT = 204 "SynapseUserRef",
HTTP_BAD_REQUEST = 400 "_auth",
HTTP_NOT_FOUND = 404 "_canon_user",
HTTP_CONFLICT = 409 "_needs_rename_display",
"_needs_rename_username",
@dataclass(frozen=True) "psycopg",
class CommsSummary: "comms",
processed: int ]
renamed: int
pruned: int
skipped: int
detail: str = ""
@dataclass(frozen=True)
class MasGuestResult:
renamed: int
skipped: int
usernames: set[str]
@dataclass(frozen=True)
class SynapseGuestResult:
renamed: int
pruned: int
@dataclass(frozen=True)
class DisplayNameTarget:
room_id: str
user_id: str
name: str
in_room: bool
@dataclass(frozen=True)
class SynapseUserRef:
entry: dict[str, Any]
user_id: str
localpart: str
def _auth(token: str) -> dict[str, str]:
return {"Authorization": f"Bearer {token}"}
def _canon_user(user: str, server_name: str) -> str:
user = (user or "").strip()
if user.startswith("@") and ":" in user:
return user
user = user.lstrip("@")
if ":" in user:
return f"@{user}"
return f"@{user}:{server_name}"
def _needs_rename_username(username: str) -> bool:
return username.isdigit() or username.startswith("guest-")
def _needs_rename_display(display: str | None) -> bool:
if not display:
return True
return display.isdigit() or display.startswith("guest-")
class CommsService: class CommsService:
"""Keep the historical comms API while delegating to focused helpers."""
def __init__( def __init__(
self, self,
client_factory: type[httpx.Client] = httpx.Client, client_factory: type[httpx.Client] = httpx.Client,
@ -95,849 +49,24 @@ class CommsService:
) -> None: ) -> None:
self._client_factory = client_factory self._client_factory = client_factory
self._name_generator = name_generator or NameGenerator() self._name_generator = name_generator or NameGenerator()
self._guest = CommsGuestService(client_factory, settings, self._name_generator)
def _pick_guest_name(self, existing: set[str]) -> str | None: self._room = CommsRoomService(client_factory, settings)
return self._name_generator.unique(existing)
def _client(self) -> httpx.Client:
return self._client_factory(timeout=settings.comms_timeout_sec)
def _admin_token(self, fallback: str) -> str:
token = getattr(settings, "comms_synapse_admin_token", "")
return token if token else fallback
def _mas_admin_token(self, client: httpx.Client) -> str:
if not settings.comms_mas_admin_client_id or not settings.comms_mas_admin_client_secret:
raise RuntimeError("mas admin client credentials missing")
basic = base64.b64encode(
f"{settings.comms_mas_admin_client_id}:{settings.comms_mas_admin_client_secret}".encode()
).decode()
last_err: Exception | None = None
for attempt in range(5):
try:
resp = client.post(
settings.comms_mas_token_url,
headers={"Authorization": f"Basic {basic}"},
data={"grant_type": "client_credentials", "scope": "urn:mas:admin"},
)
resp.raise_for_status()
payload = resp.json()
token = payload.get("access_token")
if not isinstance(token, str) or not token:
raise RuntimeError("missing mas access token")
return token
except Exception as exc: # noqa: BLE001
last_err = exc
time.sleep(2**attempt)
raise RuntimeError(str(last_err) if last_err else "mas admin token failed")
def _mas_user_id(self, client: httpx.Client, token: str, username: str) -> str:
url = f"{settings.comms_mas_admin_api_base}/users/by-username/{urllib.parse.quote(username)}"
resp = client.get(url, headers=_auth(token))
resp.raise_for_status()
payload = resp.json()
return payload["data"]["id"]
def _mas_personal_session(self, client: httpx.Client, token: str, user_id: str) -> tuple[str, str]:
resp = client.post(
f"{settings.comms_mas_admin_api_base}/personal-sessions",
headers=_auth(token),
json={
"actor_user_id": user_id,
"human_name": "guest-name-randomizer",
"scope": "urn:matrix:client:api:*",
"expires_in": 300,
},
)
resp.raise_for_status()
payload = resp.json().get("data", {})
session_id = payload.get("id")
attrs = (payload.get("attributes") or {}) if isinstance(payload, dict) else {}
access_token = attrs.get("access_token")
if not isinstance(access_token, str) or not isinstance(session_id, str):
raise RuntimeError("invalid personal session response")
return access_token, session_id
def _mas_revoke_session(self, client: httpx.Client, token: str, session_id: str) -> None:
try:
client.post(
f"{settings.comms_mas_admin_api_base}/personal-sessions/{urllib.parse.quote(session_id)}/revoke",
headers=_auth(token),
json={},
)
except Exception:
return
def _resolve_alias(self, client: httpx.Client, token: str, alias: str) -> str:
resp = client.get(
f"{settings.comms_synapse_base}/_matrix/client/v3/directory/room/{urllib.parse.quote(alias)}",
headers=_auth(token),
)
resp.raise_for_status()
payload = resp.json()
return payload["room_id"]
def _room_members(self, client: httpx.Client, token: str, room_id: str) -> tuple[set[str], set[str]]:
resp = client.get(
f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/members",
headers=_auth(token),
)
resp.raise_for_status()
payload = resp.json()
members: set[str] = set()
existing: set[str] = set()
for ev in payload.get("chunk", []) or []:
user_id = ev.get("state_key")
if isinstance(user_id, str) and user_id:
members.add(user_id)
display = (ev.get("content") or {}).get("displayname")
if isinstance(display, str) and display:
existing.add(display)
return members, existing
def _mas_list_users(self, client: httpx.Client, token: str) -> list[dict[str, Any]]:
users: list[dict[str, Any]] = []
cursor = None
while True:
url = f"{settings.comms_mas_admin_api_base}/users?page[size]=100"
if cursor:
url += f"&page[after]={urllib.parse.quote(cursor)}"
resp = client.get(url, headers=_auth(token))
resp.raise_for_status()
payload = resp.json()
data = payload.get("data") or []
if not isinstance(data, list) or not data:
break
users.extend([item for item in data if isinstance(item, dict)])
last = data[-1]
cursor = (
last.get("meta", {})
if isinstance(last, dict)
else {}
).get("page", {}).get("cursor")
if not cursor:
break
return users
def _synapse_list_users(self, client: httpx.Client, token: str) -> list[dict[str, Any]]:
users: list[dict[str, Any]] = []
from_token = None
admin_token = self._admin_token(token)
while True:
url = "{}/_synapse/admin/v2/users?local=true&deactivated=false&limit=100".format(
settings.comms_synapse_base
)
if from_token:
url += f"&from={urllib.parse.quote(from_token)}"
resp = client.get(url, headers=_auth(admin_token))
resp.raise_for_status()
payload = resp.json()
users.extend([item for item in payload.get("users", []) if isinstance(item, dict)])
from_token = payload.get("next_token")
if not from_token:
break
return users
def _should_prune_guest(self, entry: dict[str, Any], now_ms: int) -> bool:
if not entry.get("is_guest"):
return False
last_seen = entry.get("last_seen_ts")
if last_seen is None:
return False
try:
last_seen = int(last_seen)
except (TypeError, ValueError):
return False
stale_ms = int(settings.comms_guest_stale_days) * 24 * 60 * 60 * 1000
return now_ms - last_seen > stale_ms
def _prune_guest(self, client: httpx.Client, token: str, user_id: str) -> bool:
admin_token = self._admin_token(token)
try:
resp = client.delete(
f"{settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}",
headers=_auth(admin_token),
params={"erase": "true"},
)
except Exception as exc: # noqa: BLE001
logger.info(
"guest prune failed",
extra={"event": "comms_guest_prune", "status": "error", "detail": str(exc)},
)
return False
if resp.status_code in (HTTP_OK, HTTP_ACCEPTED, HTTP_NO_CONTENT, HTTP_NOT_FOUND):
return True
logger.info(
"guest prune failed",
extra={
"event": "comms_guest_prune",
"status": "error",
"detail": f"{resp.status_code} {resp.text}",
},
)
return False
def _get_displayname(self, client: httpx.Client, token: str, user_id: str) -> str | None:
resp = client.get(
f"{settings.comms_synapse_base}/_matrix/client/v3/profile/{urllib.parse.quote(user_id)}",
headers=_auth(token),
)
resp.raise_for_status()
return resp.json().get("displayname")
def _get_displayname_admin(self, client: httpx.Client, token: str, user_id: str) -> str | None:
admin_token = self._admin_token(token)
resp = client.get(
f"{settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}",
headers=_auth(admin_token),
)
if resp.status_code == HTTP_NOT_FOUND:
return None
resp.raise_for_status()
return resp.json().get("displayname")
def _set_displayname(
self,
client: httpx.Client,
token: str,
target: DisplayNameTarget,
) -> None:
resp = client.put(
f"{settings.comms_synapse_base}/_matrix/client/v3/profile/{urllib.parse.quote(target.user_id)}/displayname",
headers=_auth(token),
json={"displayname": target.name},
)
resp.raise_for_status()
if not target.in_room:
return
state_url = (
f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(target.room_id)}"
f"/state/m.room.member/{urllib.parse.quote(target.user_id)}"
)
client.put(
state_url,
headers=_auth(token),
json={"membership": "join", "displayname": target.name},
)
def _set_displayname_admin(self, client: httpx.Client, token: str, user_id: str, name: str) -> bool:
admin_token = self._admin_token(token)
resp = client.put(
f"{settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}",
headers=_auth(admin_token),
json={"displayname": name},
)
return resp.status_code in (HTTP_OK, HTTP_CREATED, HTTP_NO_CONTENT)
def _db_rename_numeric(self, existing: set[str]) -> int:
if not settings.comms_synapse_db_password:
return 0
renamed = 0
conn = psycopg.connect(
host=settings.comms_synapse_db_host,
port=settings.comms_synapse_db_port,
dbname=settings.comms_synapse_db_name,
user=settings.comms_synapse_db_user,
password=settings.comms_synapse_db_password,
)
try:
with conn:
with conn.cursor() as cur:
pattern = f"^@\\d+:{settings.comms_server_name}$"
cur.execute(
"SELECT user_id, full_user_id, displayname FROM profiles WHERE full_user_id ~ %s",
(pattern,),
)
profile_rows = cur.fetchall()
profile_index = {row[1]: row for row in profile_rows}
for _user_id, full_user_id, display in profile_rows:
if display and not _needs_rename_display(display):
continue
new_name = self._pick_guest_name(existing)
if not new_name:
continue
cur.execute(
"UPDATE profiles SET displayname = %s WHERE full_user_id = %s",
(new_name, full_user_id),
)
renamed += 1
cur.execute(
"SELECT name FROM users WHERE name ~ %s",
(pattern,),
)
users = [row[0] for row in cur.fetchall()]
if not users:
return renamed
cur.execute(
"SELECT user_id, full_user_id FROM profiles WHERE full_user_id = ANY(%s)",
(users,),
)
for existing_full in cur.fetchall():
profile_index.setdefault(existing_full[1], existing_full)
for full_user_id in users:
if full_user_id in profile_index:
continue
localpart = full_user_id.split(":", 1)[0].lstrip("@")
new_name = self._pick_guest_name(existing)
if not new_name:
continue
cur.execute(
"INSERT INTO profiles (user_id, displayname, full_user_id) VALUES (%s, %s, %s) "
"ON CONFLICT (full_user_id) DO UPDATE SET displayname = EXCLUDED.displayname",
(localpart, new_name, full_user_id),
)
renamed += 1
finally:
conn.close()
return renamed
def _validate_guest_name_settings(self) -> None:
if not settings.comms_mas_admin_client_id or not settings.comms_mas_admin_client_secret:
raise RuntimeError("comms mas admin secret missing")
if not settings.comms_synapse_base:
raise RuntimeError("comms synapse base missing")
def _room_context(self, client: httpx.Client, token: str) -> tuple[str, set[str], set[str]]:
room_id = self._resolve_alias(client, token, settings.comms_room_alias)
members, existing = self._room_members(client, token, room_id)
return room_id, members, existing
def _rename_mas_guests(
self,
client: httpx.Client,
admin_token: str,
room_id: str,
members: set[str],
existing: set[str],
) -> MasGuestResult:
renamed = 0
skipped = 0
mas_usernames: set[str] = set()
users = self._mas_list_users(client, admin_token)
for user in users:
attrs = user.get("attributes") or {}
username = attrs.get("username") or ""
if isinstance(username, str) and username:
mas_usernames.add(username)
legacy_guest = attrs.get("legacy_guest")
if not isinstance(username, str) or not username:
skipped += 1
continue
if not (legacy_guest or _needs_rename_username(username)):
skipped += 1
continue
user_id = user.get("id")
if not isinstance(user_id, str) or not user_id:
skipped += 1
continue
full_user = f"@{username}:{settings.comms_server_name}"
access_token, session_id = self._mas_personal_session(client, admin_token, user_id)
try:
display = self._get_displayname(client, access_token, full_user)
if display and not _needs_rename_display(display):
skipped += 1
continue
new_name = self._pick_guest_name(existing)
if not new_name:
skipped += 1
continue
self._set_displayname(
client,
access_token,
DisplayNameTarget(
room_id=room_id,
user_id=full_user,
name=new_name,
in_room=full_user in members,
),
)
renamed += 1
finally:
self._mas_revoke_session(client, admin_token, session_id)
return MasGuestResult(renamed=renamed, skipped=skipped, usernames=mas_usernames)
def _synapse_entries(self, client: httpx.Client, token: str) -> list[dict[str, Any]]:
try:
return self._synapse_list_users(client, token)
except Exception as exc: # noqa: BLE001
logger.info(
"synapse admin list skipped",
extra={"event": "comms_guest_list", "status": "error", "detail": str(exc)},
)
return []
def _synapse_user_id(self, entry: dict[str, Any]) -> SynapseUserRef | None:
user_id = entry.get("name") or ""
if not isinstance(user_id, str) or not user_id.startswith("@"):
return None
localpart = user_id.split(":", 1)[0].lstrip("@")
return SynapseUserRef(entry=entry, user_id=user_id, localpart=localpart)
def _maybe_prune_synapse_guest(
self,
client: httpx.Client,
token: str,
entry: dict[str, Any],
user_id: str,
now_ms: int,
) -> bool:
if not entry.get("is_guest"):
return False
if not self._should_prune_guest(entry, now_ms):
return False
return self._prune_guest(client, token, user_id)
def _needs_synapse_rename(
self,
client: httpx.Client,
token: str,
user: SynapseUserRef,
mas_usernames: set[str],
) -> bool:
if user.localpart in mas_usernames:
return False
is_guest = user.entry.get("is_guest")
if not (is_guest or _needs_rename_username(user.localpart)):
return False
display = self._get_displayname_admin(client, token, user.user_id)
if display and not _needs_rename_display(display):
return False
return True
def _rename_synapse_user(
self,
client: httpx.Client,
token: str,
existing: set[str],
user_id: str,
) -> bool:
new_name = self._pick_guest_name(existing)
if not new_name:
return False
return self._set_displayname_admin(client, token, user_id, new_name)
def _rename_synapse_guests(
self,
client: httpx.Client,
token: str,
existing: set[str],
mas_usernames: set[str],
) -> SynapseGuestResult:
renamed = 0
pruned = 0
entries = self._synapse_entries(client, token)
now_ms = int(time.time() * 1000)
for entry in entries:
user_ref = self._synapse_user_id(entry)
if not user_ref:
continue
if self._maybe_prune_synapse_guest(client, token, user_ref.entry, user_ref.user_id, now_ms):
pruned += 1
continue
if not self._needs_synapse_rename(client, token, user_ref, mas_usernames):
continue
if self._rename_synapse_user(client, token, existing, user_ref.user_id):
renamed += 1
return SynapseGuestResult(renamed=renamed, pruned=pruned)
def run_guest_name_randomizer(self, wait: bool = True) -> dict[str, Any]: def run_guest_name_randomizer(self, wait: bool = True) -> dict[str, Any]:
self._validate_guest_name_settings() self._guest._db_rename_numeric = self._db_rename_numeric
return self._guest.run_guest_name_randomizer(wait=wait)
with self._client() as client:
admin_token = self._mas_admin_token(client)
seeder_id = self._mas_user_id(client, admin_token, settings.comms_seeder_user)
seeder_token, seeder_session = self._mas_personal_session(client, admin_token, seeder_id)
try:
room_id, members, existing = self._room_context(client, seeder_token)
mas_result = self._rename_mas_guests(client, admin_token, room_id, members, existing)
synapse_result = self._rename_synapse_guests(
client,
seeder_token,
existing,
mas_result.usernames,
)
db_renamed = self._db_rename_numeric(existing)
finally:
self._mas_revoke_session(client, admin_token, seeder_session)
renamed = mas_result.renamed + synapse_result.renamed + db_renamed
pruned = synapse_result.pruned
skipped = mas_result.skipped
processed = renamed + pruned + skipped
summary = CommsSummary(processed, renamed, pruned, skipped)
logger.info(
"comms guest name sync finished",
extra={
"event": "comms_guest_name",
"status": "ok",
"processed": summary.processed,
"renamed": summary.renamed,
"pruned": summary.pruned,
"skipped": summary.skipped,
},
)
return {"status": "ok", **summary.__dict__}
def run_pin_invite(self, wait: bool = True) -> dict[str, Any]: def run_pin_invite(self, wait: bool = True) -> dict[str, Any]:
if not settings.comms_seeder_password: return self._room.run_pin_invite(wait=wait)
raise RuntimeError("comms seeder password missing")
with self._client() as client:
token = self._login(client, settings.comms_seeder_user, settings.comms_seeder_password)
room_id = self._resolve_alias(client, token, settings.comms_room_alias)
pinned = self._get_pinned(client, token, room_id)
for event_id in pinned:
event = self._get_event(client, token, room_id, event_id)
if event and (event.get("content") or {}).get("body") == settings.comms_pin_message:
return {"status": "ok", "detail": "already pinned"}
event_id = self._send_message(client, token, room_id, settings.comms_pin_message)
if not event_id:
return {"status": "error", "detail": "pin event_id missing"}
self._pin_message(client, token, room_id, event_id)
return {"status": "ok", "detail": "pinned"}
def run_reset_room(self, wait: bool = True) -> dict[str, Any]: def run_reset_room(self, wait: bool = True) -> dict[str, Any]:
if not settings.comms_seeder_password: return self._room.run_reset_room(wait=wait)
raise RuntimeError("comms seeder password missing")
with self._client() as client:
token = self._login_with_retry(client, settings.comms_seeder_user, settings.comms_seeder_password)
old_room_id = self._resolve_alias(client, token, settings.comms_room_alias)
new_room_id = self._create_room(client, token, settings.comms_room_name)
self._set_room_state(client, token, new_room_id, "m.room.join_rules", {"join_rule": "public"})
self._set_room_state(client, token, new_room_id, "m.room.guest_access", {"guest_access": "can_join"})
self._set_room_state(
client,
token,
new_room_id,
"m.room.history_visibility",
{"history_visibility": "shared"},
)
self._set_room_state(client, token, new_room_id, "m.room.power_levels", self._power_levels())
self._delete_alias(client, token, settings.comms_room_alias)
self._put_alias(client, token, settings.comms_room_alias, new_room_id)
self._set_room_state(
client,
token,
new_room_id,
"m.room.canonical_alias",
{"alias": settings.comms_room_alias},
)
self._set_directory_visibility(client, token, new_room_id, "public")
bot_user_id = _canon_user(settings.comms_bot_user, settings.comms_server_name)
self._invite_user(client, token, new_room_id, bot_user_id)
for uid in self._list_joined_members(client, token, old_room_id):
if uid == _canon_user(settings.comms_seeder_user, settings.comms_server_name):
continue
localpart = uid.split(":", 1)[0].lstrip("@")
if localpart.isdigit():
continue
self._invite_user(client, token, new_room_id, uid)
event_id = self._send_message(client, token, new_room_id, settings.comms_pin_message)
if not event_id:
raise RuntimeError("pin message event_id missing")
self._set_room_state(client, token, new_room_id, "m.room.pinned_events", {"pinned": [event_id]})
self._set_directory_visibility(client, token, old_room_id, "private")
self._set_room_state(client, token, old_room_id, "m.room.join_rules", {"join_rule": "invite"})
self._set_room_state(client, token, old_room_id, "m.room.guest_access", {"guest_access": "forbidden"})
self._set_room_state(
client,
token,
old_room_id,
"m.room.tombstone",
{
"body": "Othrys has been reset. Please join the new room.",
"replacement_room": new_room_id,
},
)
self._send_message(
client,
token,
old_room_id,
"Othrys was reset. Join the new room at https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join",
)
return {"status": "ok", "detail": f"old_room_id={old_room_id} new_room_id={new_room_id}"}
def run_seed_room(self, wait: bool = True) -> dict[str, Any]: def run_seed_room(self, wait: bool = True) -> dict[str, Any]:
if not settings.comms_seeder_password or not settings.comms_bot_password: return self._room.run_seed_room(wait=wait)
raise RuntimeError("comms seeder/bot password missing")
with self._client() as client: def _db_rename_numeric(self, existing: set[str]) -> int:
token = self._login(client, settings.comms_seeder_user, settings.comms_seeder_password) return self._guest._db_rename_numeric(existing)
for user, password, admin in (
(settings.comms_seeder_user, settings.comms_seeder_password, True),
(settings.comms_bot_user, settings.comms_bot_password, False),
):
try:
self._ensure_user(client, token, user, password, admin)
except RuntimeError as exc:
message = str(exc)
if "You are not a server admin" in message:
logger.warning(
"comms seed room ensure skipped",
extra={"event": "comms_seed_room", "user": user, "detail": message},
)
continue
raise
room_id = self._ensure_room(client, token)
self._join_user(client, token, room_id, _canon_user(settings.comms_bot_user, settings.comms_server_name))
self._join_all_locals(client, token, room_id)
return {"status": "ok", "detail": "room seeded"}
def _login(self, client: httpx.Client, user: str, password: str) -> str:
resp = client.post(
f"{settings.comms_auth_base}/_matrix/client/v3/login",
json={
"type": "m.login.password",
"identifier": {"type": "m.id.user", "user": _canon_user(user, settings.comms_server_name)},
"password": password,
},
)
if resp.status_code != HTTP_OK:
raise RuntimeError(f"login failed: {resp.status_code} {resp.text}")
payload = resp.json()
token = payload.get("access_token")
if not isinstance(token, str) or not token:
raise RuntimeError("login missing token")
return token
def _login_with_retry(self, client: httpx.Client, user: str, password: str) -> str:
last: Exception | None = None
for attempt in range(1, 6):
try:
return self._login(client, user, password)
except Exception as exc: # noqa: BLE001
last = exc
time.sleep(attempt * 2)
raise RuntimeError(str(last) if last else "login failed")
def _get_pinned(self, client: httpx.Client, token: str, room_id: str) -> list[str]:
resp = client.get(
f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/state/m.room.pinned_events",
headers=_auth(token),
)
if resp.status_code == HTTP_NOT_FOUND:
return []
resp.raise_for_status()
pinned = resp.json().get("pinned", [])
return [item for item in pinned if isinstance(item, str)]
def _get_event(self, client: httpx.Client, token: str, room_id: str, event_id: str) -> dict[str, Any] | None:
resp = client.get(
f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/event/{urllib.parse.quote(event_id)}",
headers=_auth(token),
)
if resp.status_code == HTTP_NOT_FOUND:
return None
resp.raise_for_status()
return resp.json()
def _send_message(self, client: httpx.Client, token: str, room_id: str, body: str) -> str:
resp = client.post(
f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/send/m.room.message",
headers=_auth(token),
json={"msgtype": "m.text", "body": body},
)
resp.raise_for_status()
payload = resp.json()
event_id = payload.get("event_id")
return event_id if isinstance(event_id, str) else ""
def _pin_message(self, client: httpx.Client, token: str, room_id: str, event_id: str) -> None:
resp = client.put(
f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/state/m.room.pinned_events",
headers=_auth(token),
json={"pinned": [event_id]},
)
resp.raise_for_status()
def _create_room(self, client: httpx.Client, token: str, name: str) -> str:
resp = client.post(
f"{settings.comms_synapse_base}/_matrix/client/v3/createRoom",
headers=_auth(token),
json={"preset": "public_chat", "name": name, "room_version": "11"},
)
resp.raise_for_status()
return resp.json()["room_id"]
def _set_room_state(self, client: httpx.Client, token: str, room_id: str, ev_type: str, content: dict[str, Any]) -> None:
resp = client.put(
f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/state/{ev_type}",
headers=_auth(token),
json=content,
)
resp.raise_for_status()
def _set_directory_visibility(self, client: httpx.Client, token: str, room_id: str, visibility: str) -> None:
resp = client.put(
f"{settings.comms_synapse_base}/_matrix/client/v3/directory/list/room/{urllib.parse.quote(room_id)}",
headers=_auth(token),
json={"visibility": visibility},
)
resp.raise_for_status()
def _delete_alias(self, client: httpx.Client, token: str, alias: str) -> None:
resp = client.delete(
f"{settings.comms_synapse_base}/_matrix/client/v3/directory/room/{urllib.parse.quote(alias)}",
headers=_auth(token),
)
if resp.status_code in (HTTP_OK, HTTP_ACCEPTED, HTTP_NOT_FOUND):
return
resp.raise_for_status()
def _put_alias(self, client: httpx.Client, token: str, alias: str, room_id: str) -> None:
resp = client.put(
f"{settings.comms_synapse_base}/_matrix/client/v3/directory/room/{urllib.parse.quote(alias)}",
headers=_auth(token),
json={"room_id": room_id},
)
resp.raise_for_status()
def _list_joined_members(self, client: httpx.Client, token: str, room_id: str) -> list[str]:
resp = client.get(
f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/members?membership=join",
headers=_auth(token),
)
resp.raise_for_status()
members = []
for ev in resp.json().get("chunk", []) or []:
if ev.get("type") != "m.room.member":
continue
uid = ev.get("state_key")
if isinstance(uid, str) and uid.startswith("@"):
members.append(uid)
return members
def _invite_user(self, client: httpx.Client, token: str, room_id: str, user_id: str) -> None:
resp = client.post(
f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/invite",
headers=_auth(token),
json={"user_id": user_id},
)
if resp.status_code in (HTTP_OK, HTTP_ACCEPTED):
return
resp.raise_for_status()
def _power_levels(self) -> dict[str, Any]:
return {
"ban": 50,
"events": {
"m.room.avatar": 50,
"m.room.canonical_alias": 50,
"m.room.encryption": 100,
"m.room.history_visibility": 100,
"m.room.name": 50,
"m.room.power_levels": 100,
"m.room.server_acl": 100,
"m.room.tombstone": 100,
},
"events_default": 0,
"historical": 100,
"invite": 50,
"kick": 50,
"m.call.invite": 50,
"redact": 50,
"state_default": 50,
"users": { _canon_user(settings.comms_seeder_user, settings.comms_server_name): 100 },
"users_default": 0,
}
def _ensure_user(self, client: httpx.Client, token: str, localpart: str, password: str, admin: bool) -> None:
admin_token = self._admin_token(token)
user_id = _canon_user(localpart, settings.comms_server_name)
url = f"{settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}"
resp = client.get(url, headers=_auth(admin_token))
if resp.status_code == HTTP_OK:
return
payload = {"password": password, "admin": admin, "deactivated": False}
create = client.put(url, headers=_auth(admin_token), json=payload)
if create.status_code not in (HTTP_OK, HTTP_CREATED):
raise RuntimeError(f"create user {user_id} failed: {create.status_code} {create.text}")
def _ensure_room(self, client: httpx.Client, token: str) -> str:
alias = settings.comms_room_alias
alias_enc = urllib.parse.quote(alias)
exists = client.get(
f"{settings.comms_synapse_base}/_matrix/client/v3/directory/room/{alias_enc}",
headers=_auth(token),
)
if exists.status_code == HTTP_OK:
room_id = exists.json()["room_id"]
else:
create = client.post(
f"{settings.comms_synapse_base}/_matrix/client/v3/createRoom",
headers=_auth(token),
json={
"preset": "public_chat",
"name": settings.comms_room_name,
"room_alias_name": alias.split(":", 1)[0].lstrip("#"),
"initial_state": [],
"power_level_content_override": {
"events_default": 0,
"users_default": 0,
"state_default": 50,
},
},
)
if create.status_code not in (HTTP_OK, HTTP_CONFLICT):
raise RuntimeError(f"create room failed: {create.status_code} {create.text}")
exists = client.get(
f"{settings.comms_synapse_base}/_matrix/client/v3/directory/room/{alias_enc}",
headers=_auth(token),
)
room_id = exists.json()["room_id"]
state_events = [
("m.room.join_rules", {"join_rule": "public"}),
("m.room.guest_access", {"guest_access": "can_join"}),
("m.room.history_visibility", {"history_visibility": "shared"}),
("m.room.canonical_alias", {"alias": alias}),
]
for ev_type, content in state_events:
client.put(
f"{settings.comms_synapse_base}/_matrix/client/v3/rooms/{room_id}/state/{ev_type}",
headers=_auth(token),
json=content,
)
client.put(
f"{settings.comms_synapse_base}/_matrix/client/v3/directory/list/room/{room_id}",
headers=_auth(token),
json={"visibility": "public"},
)
return room_id
def _join_user(self, client: httpx.Client, token: str, room_id: str, user_id: str) -> None:
admin_token = self._admin_token(token)
client.post(
f"{settings.comms_synapse_base}/_synapse/admin/v1/join/{urllib.parse.quote(room_id)}",
headers=_auth(admin_token),
json={"user_id": user_id},
)
def _join_all_locals(self, client: httpx.Client, token: str, room_id: str) -> None:
users: list[str] = []
from_token = None
admin_token = self._admin_token(token)
while True:
url = f"{settings.comms_synapse_base}/_synapse/admin/v2/users?local=true&deactivated=false&limit=100"
if from_token:
url += f"&from={from_token}"
resp = client.get(url, headers=_auth(admin_token))
payload = resp.json()
users.extend([u["name"] for u in payload.get("users", []) if isinstance(u, dict) and u.get("name")])
from_token = payload.get("next_token")
if not from_token:
break
for uid in users:
self._join_user(client, token, room_id, uid)
comms = CommsService() comms = CommsService()

View File

@ -0,0 +1,487 @@
"""Guest name randomizer helpers."""
from __future__ import annotations
from typing import Any
import base64
import time
import urllib.parse
import httpx
import psycopg
from ..utils.logging import get_logger
from ..utils.name_generator import NameGenerator
from .comms_support import (
CommsServiceBase,
CommsSummary,
DisplayNameTarget,
MasGuestResult,
SynapseGuestResult,
SynapseUserRef,
_auth,
_needs_rename_display,
_needs_rename_username,
)
logger = get_logger(__name__)
HTTP_NOT_FOUND = 404
class CommsGuestService(CommsServiceBase):
"""Rename and prune guest accounts across MAS, Synapse, and the database."""
def __init__(self, client_factory: type[httpx.Client], settings: Any, name_generator: NameGenerator) -> None:
super().__init__(client_factory, settings)
self._name_generator = name_generator
def _pick_guest_name(self, existing: set[str]) -> str | None:
return self._name_generator.unique(existing)
def _mas_admin_token(self, client: httpx.Client) -> str:
if not self._settings.comms_mas_admin_client_id or not self._settings.comms_mas_admin_client_secret:
raise RuntimeError("mas admin client credentials missing")
basic = base64.b64encode(
f"{self._settings.comms_mas_admin_client_id}:{self._settings.comms_mas_admin_client_secret}".encode()
).decode()
last_err: Exception | None = None
for attempt in range(5):
try:
resp = client.post(
self._settings.comms_mas_token_url,
headers={"Authorization": f"Basic {basic}"},
data={"grant_type": "client_credentials", "scope": "urn:mas:admin"},
)
resp.raise_for_status()
payload = resp.json()
token = payload.get("access_token")
if not isinstance(token, str) or not token:
raise RuntimeError("missing mas access token")
return token
except Exception as exc: # noqa: BLE001
last_err = exc
time.sleep(2**attempt)
raise RuntimeError(str(last_err) if last_err else "mas admin token failed")
def _mas_user_id(self, client: httpx.Client, token: str, username: str) -> str:
url = f"{self._settings.comms_mas_admin_api_base}/users/by-username/{urllib.parse.quote(username)}"
resp = client.get(url, headers=_auth(token))
resp.raise_for_status()
payload = resp.json()
return payload["data"]["id"]
def _mas_personal_session(self, client: httpx.Client, token: str, user_id: str) -> tuple[str, str]:
resp = client.post(
f"{self._settings.comms_mas_admin_api_base}/personal-sessions",
headers=_auth(token),
json={
"actor_user_id": user_id,
"human_name": "guest-name-randomizer",
"scope": "urn:matrix:client:api:*",
"expires_in": 300,
},
)
resp.raise_for_status()
payload = resp.json().get("data", {})
session_id = payload.get("id")
attrs = (payload.get("attributes") or {}) if isinstance(payload, dict) else {}
access_token = attrs.get("access_token")
if not isinstance(access_token, str) or not isinstance(session_id, str):
raise RuntimeError("invalid personal session response")
return access_token, session_id
def _mas_revoke_session(self, client: httpx.Client, token: str, session_id: str) -> None:
try:
client.post(
f"{self._settings.comms_mas_admin_api_base}/personal-sessions/{urllib.parse.quote(session_id)}/revoke",
headers=_auth(token),
json={},
)
except Exception:
return
def _mas_list_users(self, client: httpx.Client, token: str) -> list[dict[str, Any]]:
users: list[dict[str, Any]] = []
cursor = None
while True:
url = f"{self._settings.comms_mas_admin_api_base}/users?page[size]=100"
if cursor:
url += f"&page[after]={urllib.parse.quote(cursor)}"
resp = client.get(url, headers=_auth(token))
resp.raise_for_status()
payload = resp.json()
data = payload.get("data") or []
if not isinstance(data, list) or not data:
break
users.extend([item for item in data if isinstance(item, dict)])
last = data[-1]
cursor = (
last.get("meta", {})
if isinstance(last, dict)
else {}
).get("page", {}).get("cursor")
if not cursor:
break
return users
def _synapse_list_users(self, client: httpx.Client, token: str) -> list[dict[str, Any]]:
users: list[dict[str, Any]] = []
from_token = None
admin_token = self._admin_token(token)
while True:
url = "{}/_synapse/admin/v2/users?local=true&deactivated=false&limit=100".format(
self._settings.comms_synapse_base
)
if from_token:
url += f"&from={urllib.parse.quote(from_token)}"
resp = client.get(url, headers=_auth(admin_token))
resp.raise_for_status()
payload = resp.json()
users.extend([item for item in payload.get("users", []) if isinstance(item, dict)])
from_token = payload.get("next_token")
if not from_token:
break
return users
def _should_prune_guest(self, entry: dict[str, Any], now_ms: int) -> bool:
if not entry.get("is_guest"):
return False
last_seen = entry.get("last_seen_ts")
if last_seen is None:
return False
try:
last_seen = int(last_seen)
except (TypeError, ValueError):
return False
stale_ms = int(self._settings.comms_guest_stale_days) * 24 * 60 * 60 * 1000
return now_ms - last_seen > stale_ms
def _prune_guest(self, client: httpx.Client, token: str, user_id: str) -> bool:
admin_token = self._admin_token(token)
try:
resp = client.delete(
f"{self._settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}",
headers=_auth(admin_token),
params={"erase": "true"},
)
except Exception as exc: # noqa: BLE001
logger.info(
"guest prune failed",
extra={"event": "comms_guest_prune", "status": "error", "detail": str(exc)},
)
return False
if resp.status_code in (200, 202, 204, 404):
return True
logger.info(
"guest prune failed",
extra={
"event": "comms_guest_prune",
"status": "error",
"detail": f"{resp.status_code} {resp.text}",
},
)
return False
def _get_displayname(self, client: httpx.Client, token: str, user_id: str) -> str | None:
resp = client.get(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/profile/{urllib.parse.quote(user_id)}",
headers=_auth(token),
)
resp.raise_for_status()
return resp.json().get("displayname")
def _get_displayname_admin(self, client: httpx.Client, token: str, user_id: str) -> str | None:
admin_token = self._admin_token(token)
resp = client.get(
f"{self._settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}",
headers=_auth(admin_token),
)
if resp.status_code == HTTP_NOT_FOUND:
return None
resp.raise_for_status()
return resp.json().get("displayname")
def _set_displayname(
self,
client: httpx.Client,
token: str,
target: DisplayNameTarget,
) -> None:
resp = client.put(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/profile/{urllib.parse.quote(target.user_id)}/displayname",
headers=_auth(token),
json={"displayname": target.name},
)
resp.raise_for_status()
if not target.in_room:
return
state_url = (
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(target.room_id)}"
f"/state/m.room.member/{urllib.parse.quote(target.user_id)}"
)
client.put(
state_url,
headers=_auth(token),
json={"membership": "join", "displayname": target.name},
)
def _set_displayname_admin(self, client: httpx.Client, token: str, user_id: str, name: str) -> bool:
admin_token = self._admin_token(token)
resp = client.put(
f"{self._settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}",
headers=_auth(admin_token),
json={"displayname": name},
)
return resp.status_code in (200, 201, 204)
def _db_rename_numeric(self, existing: set[str]) -> int:
if not getattr(self._settings, "comms_synapse_db_password", ""):
return 0
renamed = 0
conn = psycopg.connect(
host=self._settings.comms_synapse_db_host,
port=self._settings.comms_synapse_db_port,
dbname=self._settings.comms_synapse_db_name,
user=self._settings.comms_synapse_db_user,
password=self._settings.comms_synapse_db_password,
)
try:
with conn:
with conn.cursor() as cur:
pattern = f"^@\\d+:{self._settings.comms_server_name}$"
cur.execute(
"SELECT user_id, full_user_id, displayname FROM profiles WHERE full_user_id ~ %s",
(pattern,),
)
profile_rows = cur.fetchall()
profile_index = {row[1]: row for row in profile_rows}
for _user_id, full_user_id, display in profile_rows:
if display and not _needs_rename_display(display):
continue
new_name = self._pick_guest_name(existing)
if not new_name:
continue
cur.execute(
"UPDATE profiles SET displayname = %s WHERE full_user_id = %s",
(new_name, full_user_id),
)
renamed += 1
cur.execute(
"SELECT name FROM users WHERE name ~ %s",
(pattern,),
)
users = [row[0] for row in cur.fetchall()]
if not users:
return renamed
cur.execute(
"SELECT user_id, full_user_id FROM profiles WHERE full_user_id = ANY(%s)",
(users,),
)
for existing_full in cur.fetchall():
profile_index.setdefault(existing_full[1], existing_full)
for full_user_id in users:
if full_user_id in profile_index:
continue
localpart = full_user_id.split(":", 1)[0].lstrip("@")
new_name = self._pick_guest_name(existing)
if not new_name:
continue
cur.execute(
"INSERT INTO profiles (user_id, displayname, full_user_id) VALUES (%s, %s, %s) "
"ON CONFLICT (full_user_id) DO UPDATE SET displayname = EXCLUDED.displayname",
(localpart, new_name, full_user_id),
)
renamed += 1
finally:
conn.close()
return renamed
def _validate_guest_name_settings(self) -> None:
if not self._settings.comms_mas_admin_client_id or not self._settings.comms_mas_admin_client_secret:
raise RuntimeError("comms mas admin secret missing")
if not self._settings.comms_synapse_base:
raise RuntimeError("comms synapse base missing")
def _room_context(self, client: httpx.Client, token: str) -> tuple[str, set[str], set[str]]:
room_id = self._resolve_alias(client, token, self._settings.comms_room_alias)
members, existing = self._room_members(client, token, room_id)
return room_id, members, existing
def _rename_mas_guests(
self,
client: httpx.Client,
admin_token: str,
room_id: str,
members: set[str],
existing: set[str],
) -> MasGuestResult:
renamed = 0
skipped = 0
mas_usernames: set[str] = set()
users = self._mas_list_users(client, admin_token)
for user in users:
attrs = user.get("attributes") or {}
username = attrs.get("username") or ""
if isinstance(username, str) and username:
mas_usernames.add(username)
legacy_guest = attrs.get("legacy_guest")
if not isinstance(username, str) or not username:
skipped += 1
continue
if not (legacy_guest or _needs_rename_username(username)):
skipped += 1
continue
user_id = user.get("id")
if not isinstance(user_id, str) or not user_id:
skipped += 1
continue
full_user = f"@{username}:{self._settings.comms_server_name}"
access_token, session_id = self._mas_personal_session(client, admin_token, user_id)
try:
display = self._get_displayname(client, access_token, full_user)
if display and not _needs_rename_display(display):
skipped += 1
continue
new_name = self._pick_guest_name(existing)
if not new_name:
skipped += 1
continue
self._set_displayname(
client,
access_token,
DisplayNameTarget(
room_id=room_id,
user_id=full_user,
name=new_name,
in_room=full_user in members,
),
)
renamed += 1
finally:
self._mas_revoke_session(client, admin_token, session_id)
return MasGuestResult(renamed=renamed, skipped=skipped, usernames=mas_usernames)
def _synapse_entries(self, client: httpx.Client, token: str) -> list[dict[str, Any]]:
try:
return self._synapse_list_users(client, token)
except Exception as exc: # noqa: BLE001
logger.info(
"synapse admin list skipped",
extra={"event": "comms_guest_list", "status": "error", "detail": str(exc)},
)
return []
def _synapse_user_id(self, entry: dict[str, Any]) -> SynapseUserRef | None:
user_id = entry.get("name") or ""
if not isinstance(user_id, str) or not user_id.startswith("@"):
return None
localpart = user_id.split(":", 1)[0].lstrip("@")
return SynapseUserRef(entry=entry, user_id=user_id, localpart=localpart)
def _maybe_prune_synapse_guest(
self,
client: httpx.Client,
token: str,
entry: dict[str, Any],
user_id: str,
now_ms: int,
) -> bool:
if not entry.get("is_guest"):
return False
if not self._should_prune_guest(entry, now_ms):
return False
return self._prune_guest(client, token, user_id)
def _needs_synapse_rename(
self,
client: httpx.Client,
token: str,
user: SynapseUserRef,
mas_usernames: set[str],
) -> bool:
if user.localpart in mas_usernames:
return False
is_guest = user.entry.get("is_guest")
if not (is_guest or _needs_rename_username(user.localpart)):
return False
display = self._get_displayname_admin(client, token, user.user_id)
if display and not _needs_rename_display(display):
return False
return True
def _rename_synapse_user(
self,
client: httpx.Client,
token: str,
existing: set[str],
user_id: str,
) -> bool:
new_name = self._pick_guest_name(existing)
if not new_name:
return False
return self._set_displayname_admin(client, token, user_id, new_name)
def _rename_synapse_guests(
self,
client: httpx.Client,
token: str,
existing: set[str],
mas_usernames: set[str],
) -> SynapseGuestResult:
renamed = 0
pruned = 0
entries = self._synapse_entries(client, token)
now_ms = int(time.time() * 1000)
for entry in entries:
user_ref = self._synapse_user_id(entry)
if not user_ref:
continue
if self._maybe_prune_synapse_guest(client, token, user_ref.entry, user_ref.user_id, now_ms):
pruned += 1
continue
if not self._needs_synapse_rename(client, token, user_ref, mas_usernames):
continue
if self._rename_synapse_user(client, token, existing, user_ref.user_id):
renamed += 1
return SynapseGuestResult(renamed=renamed, pruned=pruned)
def run_guest_name_randomizer(self, wait: bool = True) -> dict[str, Any]:
"""Rename guest accounts in the guest room and related backing stores."""
self._validate_guest_name_settings()
with self._client() as client:
admin_token = self._mas_admin_token(client)
seeder_id = self._mas_user_id(client, admin_token, self._settings.comms_seeder_user)
seeder_token, seeder_session = self._mas_personal_session(client, admin_token, seeder_id)
try:
room_id, members, existing = self._room_context(client, seeder_token)
mas_result = self._rename_mas_guests(client, admin_token, room_id, members, existing)
synapse_result = self._rename_synapse_guests(
client,
seeder_token,
existing,
mas_result.usernames,
)
db_renamed = self._db_rename_numeric(existing)
finally:
self._mas_revoke_session(client, admin_token, seeder_session)
renamed = mas_result.renamed + synapse_result.renamed + db_renamed
pruned = synapse_result.pruned
skipped = mas_result.skipped
processed = renamed + pruned + skipped
summary = CommsSummary(processed, renamed, pruned, skipped)
logger.info(
"comms guest name sync finished",
extra={
"event": "comms_guest_name",
"status": "ok",
"processed": summary.processed,
"renamed": summary.renamed,
"pruned": summary.pruned,
"skipped": summary.skipped,
},
)
return {"status": "ok", **summary.__dict__}

View File

@ -0,0 +1,382 @@
"""Room lifecycle helpers for Matrix comms workflows."""
from __future__ import annotations
from typing import Any
import time
import urllib.parse
import httpx
from ..utils.logging import get_logger
from .comms_support import CommsServiceBase, _auth, _canon_user
logger = get_logger(__name__)
HTTP_OK = 200
HTTP_CREATED = 201
HTTP_ACCEPTED = 202
HTTP_NO_CONTENT = 204
HTTP_BAD_REQUEST = 400
HTTP_NOT_FOUND = 404
HTTP_CONFLICT = 409
class CommsRoomService(CommsServiceBase):
"""Pin, reset, and seed the room used by Ariadne comms tasks."""
def run_pin_invite(self, wait: bool = True) -> dict[str, Any]:
if not self._settings.comms_seeder_password:
raise RuntimeError("comms seeder password missing")
with self._client() as client:
token = self._login(client, self._settings.comms_seeder_user, self._settings.comms_seeder_password)
room_id = self._resolve_alias(client, token, self._settings.comms_room_alias)
pinned = self._get_pinned(client, token, room_id)
for event_id in pinned:
event = self._get_event(client, token, room_id, event_id)
if event and (event.get("content") or {}).get("body") == self._settings.comms_pin_message:
return {"status": "ok", "detail": "already pinned"}
event_id = self._send_message(client, token, room_id, self._settings.comms_pin_message)
if not event_id:
return {"status": "error", "detail": "pin event_id missing"}
self._pin_message(client, token, room_id, event_id)
return {"status": "ok", "detail": "pinned"}
def run_reset_room(self, wait: bool = True) -> dict[str, Any]:
if not self._settings.comms_seeder_password:
raise RuntimeError("comms seeder password missing")
with self._client() as client:
token = self._login_with_retry(client, self._settings.comms_seeder_user, self._settings.comms_seeder_password)
old_room_id = self._resolve_alias(client, token, self._settings.comms_room_alias)
new_room_id = self._create_room(client, token, self._settings.comms_room_name)
self._set_room_state(client, token, new_room_id, "m.room.join_rules", {"join_rule": "public"})
self._set_room_state(client, token, new_room_id, "m.room.guest_access", {"guest_access": "can_join"})
self._set_room_state(
client,
token,
new_room_id,
"m.room.history_visibility",
{"history_visibility": "shared"},
)
self._set_room_state(client, token, new_room_id, "m.room.power_levels", self._power_levels())
self._delete_alias(client, token, self._settings.comms_room_alias)
self._put_alias(client, token, self._settings.comms_room_alias, new_room_id)
self._set_room_state(
client,
token,
new_room_id,
"m.room.canonical_alias",
{"alias": self._settings.comms_room_alias},
)
self._set_directory_visibility(client, token, new_room_id, "public")
bot_user_id = _canon_user(self._settings.comms_bot_user, self._settings.comms_server_name)
self._invite_user(client, token, new_room_id, bot_user_id)
for uid in self._list_joined_members(client, token, old_room_id):
if uid == _canon_user(self._settings.comms_seeder_user, self._settings.comms_server_name):
continue
localpart = uid.split(":", 1)[0].lstrip("@")
if localpart.isdigit():
continue
self._invite_user(client, token, new_room_id, uid)
event_id = self._send_message(client, token, new_room_id, self._settings.comms_pin_message)
if not event_id:
raise RuntimeError("pin message event_id missing")
self._set_room_state(client, token, new_room_id, "m.room.pinned_events", {"pinned": [event_id]})
self._set_directory_visibility(client, token, old_room_id, "private")
self._set_room_state(client, token, old_room_id, "m.room.join_rules", {"join_rule": "invite"})
self._set_room_state(client, token, old_room_id, "m.room.guest_access", {"guest_access": "forbidden"})
self._set_room_state(
client,
token,
old_room_id,
"m.room.tombstone",
{
"body": "Othrys has been reset. Please join the new room.",
"replacement_room": new_room_id,
},
)
self._send_message(
client,
token,
old_room_id,
"Othrys was reset. Join the new room at https://live.bstein.dev/#/room/#othrys:live.bstein.dev?action=join",
)
return {"status": "ok", "detail": f"old_room_id={old_room_id} new_room_id={new_room_id}"}
def run_seed_room(self, wait: bool = True) -> dict[str, Any]:
if not self._settings.comms_seeder_password or not self._settings.comms_bot_password:
raise RuntimeError("comms seeder/bot password missing")
with self._client() as client:
token = self._login(client, self._settings.comms_seeder_user, self._settings.comms_seeder_password)
for user, password, admin in (
(self._settings.comms_seeder_user, self._settings.comms_seeder_password, True),
(self._settings.comms_bot_user, self._settings.comms_bot_password, False),
):
try:
self._ensure_user(client, token, user, password, admin)
except RuntimeError as exc:
message = str(exc)
if "You are not a server admin" in message:
logger.warning(
"comms seed room ensure skipped",
extra={"event": "comms_seed_room", "user": user, "detail": message},
)
continue
raise
room_id = self._ensure_room(client, token)
self._join_user(client, token, room_id, _canon_user(self._settings.comms_bot_user, self._settings.comms_server_name))
self._join_all_locals(client, token, room_id)
return {"status": "ok", "detail": "room seeded"}
def _login(self, client: httpx.Client, user: str, password: str) -> str:
resp = client.post(
f"{self._settings.comms_auth_base}/_matrix/client/v3/login",
json={
"type": "m.login.password",
"identifier": {"type": "m.id.user", "user": _canon_user(user, self._settings.comms_server_name)},
"password": password,
},
)
if resp.status_code != HTTP_OK:
raise RuntimeError(f"login failed: {resp.status_code} {resp.text}")
payload = resp.json()
token = payload.get("access_token")
if not isinstance(token, str) or not token:
raise RuntimeError("login missing token")
return token
def _login_with_retry(self, client: httpx.Client, user: str, password: str) -> str:
last: Exception | None = None
for attempt in range(1, 6):
try:
return self._login(client, user, password)
except Exception as exc: # noqa: BLE001
last = exc
time.sleep(attempt * 2)
raise RuntimeError(str(last) if last else "login failed")
def _get_pinned(self, client: httpx.Client, token: str, room_id: str) -> list[str]:
resp = client.get(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/state/m.room.pinned_events",
headers=_auth(token),
)
if resp.status_code == HTTP_NOT_FOUND:
return []
resp.raise_for_status()
pinned = resp.json().get("pinned", [])
return [item for item in pinned if isinstance(item, str)]
def _get_event(self, client: httpx.Client, token: str, room_id: str, event_id: str) -> dict[str, Any] | None:
resp = client.get(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/event/{urllib.parse.quote(event_id)}",
headers=_auth(token),
)
if resp.status_code == HTTP_NOT_FOUND:
return None
resp.raise_for_status()
return resp.json()
def _send_message(self, client: httpx.Client, token: str, room_id: str, body: str) -> str:
resp = client.post(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/send/m.room.message",
headers=_auth(token),
json={"msgtype": "m.text", "body": body},
)
resp.raise_for_status()
payload = resp.json()
event_id = payload.get("event_id")
return event_id if isinstance(event_id, str) else ""
def _pin_message(self, client: httpx.Client, token: str, room_id: str, event_id: str) -> None:
resp = client.put(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/state/m.room.pinned_events",
headers=_auth(token),
json={"pinned": [event_id]},
)
resp.raise_for_status()
def _create_room(self, client: httpx.Client, token: str, name: str) -> str:
resp = client.post(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/createRoom",
headers=_auth(token),
json={"preset": "public_chat", "name": name, "room_version": "11"},
)
resp.raise_for_status()
return resp.json()["room_id"]
def _set_room_state(self, client: httpx.Client, token: str, room_id: str, ev_type: str, content: dict[str, Any]) -> None:
resp = client.put(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/state/{ev_type}",
headers=_auth(token),
json=content,
)
resp.raise_for_status()
def _set_directory_visibility(self, client: httpx.Client, token: str, room_id: str, visibility: str) -> None:
resp = client.put(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/directory/list/room/{urllib.parse.quote(room_id)}",
headers=_auth(token),
json={"visibility": visibility},
)
resp.raise_for_status()
def _delete_alias(self, client: httpx.Client, token: str, alias: str) -> None:
resp = client.delete(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/directory/room/{urllib.parse.quote(alias)}",
headers=_auth(token),
)
if resp.status_code in (HTTP_OK, HTTP_ACCEPTED, HTTP_NOT_FOUND):
return
resp.raise_for_status()
def _put_alias(self, client: httpx.Client, token: str, alias: str, room_id: str) -> None:
resp = client.put(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/directory/room/{urllib.parse.quote(alias)}",
headers=_auth(token),
json={"room_id": room_id},
)
resp.raise_for_status()
def _list_joined_members(self, client: httpx.Client, token: str, room_id: str) -> list[str]:
resp = client.get(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/members?membership=join",
headers=_auth(token),
)
resp.raise_for_status()
members = []
for ev in resp.json().get("chunk", []) or []:
if ev.get("type") != "m.room.member":
continue
uid = ev.get("state_key")
if isinstance(uid, str) and uid.startswith("@"):
members.append(uid)
return members
def _invite_user(self, client: httpx.Client, token: str, room_id: str, user_id: str) -> None:
resp = client.post(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/invite",
headers=_auth(token),
json={"user_id": user_id},
)
if resp.status_code in (HTTP_OK, HTTP_ACCEPTED):
return
resp.raise_for_status()
def _power_levels(self) -> dict[str, Any]:
return {
"ban": 50,
"events": {
"m.room.avatar": 50,
"m.room.canonical_alias": 50,
"m.room.encryption": 100,
"m.room.history_visibility": 100,
"m.room.name": 50,
"m.room.power_levels": 100,
"m.room.server_acl": 100,
"m.room.tombstone": 100,
},
"events_default": 0,
"historical": 100,
"invite": 50,
"kick": 50,
"m.call.invite": 50,
"redact": 50,
"state_default": 50,
"users": { _canon_user(self._settings.comms_seeder_user, self._settings.comms_server_name): 100 },
"users_default": 0,
}
def _ensure_user(self, client: httpx.Client, token: str, localpart: str, password: str, admin: bool) -> None:
admin_token = self._admin_token(token)
user_id = _canon_user(localpart, self._settings.comms_server_name)
url = f"{self._settings.comms_synapse_base}/_synapse/admin/v2/users/{urllib.parse.quote(user_id)}"
resp = client.get(url, headers=_auth(admin_token))
if resp.status_code == HTTP_OK:
return
payload = {"password": password, "admin": admin, "deactivated": False}
create = client.put(url, headers=_auth(admin_token), json=payload)
if create.status_code not in (HTTP_OK, HTTP_CREATED):
raise RuntimeError(f"create user {user_id} failed: {create.status_code} {create.text}")
def _ensure_room(self, client: httpx.Client, token: str) -> str:
alias = self._settings.comms_room_alias
alias_enc = urllib.parse.quote(alias)
exists = client.get(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/directory/room/{alias_enc}",
headers=_auth(token),
)
if exists.status_code == HTTP_OK:
room_id = exists.json()["room_id"]
else:
create = client.post(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/createRoom",
headers=_auth(token),
json={
"preset": "public_chat",
"name": self._settings.comms_room_name,
"room_alias_name": alias.split(":", 1)[0].lstrip("#"),
"initial_state": [],
"power_level_content_override": {
"events_default": 0,
"users_default": 0,
"state_default": 50,
},
},
)
if create.status_code not in (HTTP_OK, HTTP_CONFLICT):
raise RuntimeError(f"create room failed: {create.status_code} {create.text}")
exists = client.get(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/directory/room/{alias_enc}",
headers=_auth(token),
)
room_id = exists.json()["room_id"]
state_events = [
("m.room.join_rules", {"join_rule": "public"}),
("m.room.guest_access", {"guest_access": "can_join"}),
("m.room.history_visibility", {"history_visibility": "shared"}),
("m.room.canonical_alias", {"alias": alias}),
]
for ev_type, content in state_events:
client.put(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{room_id}/state/{ev_type}",
headers=_auth(token),
json=content,
)
client.put(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/directory/list/room/{room_id}",
headers=_auth(token),
json={"visibility": "public"},
)
return room_id
def _join_user(self, client: httpx.Client, token: str, room_id: str, user_id: str) -> None:
admin_token = self._admin_token(token)
client.post(
f"{self._settings.comms_synapse_base}/_synapse/admin/v1/join/{urllib.parse.quote(room_id)}",
headers=_auth(admin_token),
json={"user_id": user_id},
)
def _join_all_locals(self, client: httpx.Client, token: str, room_id: str) -> None:
users: list[str] = []
from_token = None
admin_token = self._admin_token(token)
while True:
url = f"{self._settings.comms_synapse_base}/_synapse/admin/v2/users?local=true&deactivated=false&limit=100"
if from_token:
url += f"&from={from_token}"
resp = client.get(url, headers=_auth(admin_token))
payload = resp.json()
users.extend([u["name"] for u in payload.get("users", []) if isinstance(u, dict) and u.get("name")])
from_token = payload.get("next_token")
if not from_token:
break
for uid in users:
self._join_user(client, token, room_id, uid)

View File

@ -0,0 +1,129 @@
"""Shared types and helpers for Matrix comms services."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
import httpx
import urllib.parse
@dataclass(frozen=True)
class CommsSummary:
"""Summarize a comms workflow.
WHY: the API returns compact JSON status objects that callers and tests
inspect directly.
"""
processed: int
renamed: int
pruned: int
skipped: int
detail: str = ""
@dataclass(frozen=True)
class MasGuestResult:
"""Result for the MAS guest rename pass."""
renamed: int
skipped: int
usernames: set[str]
@dataclass(frozen=True)
class SynapseGuestResult:
"""Result for the Synapse guest rename pass."""
renamed: int
pruned: int
@dataclass(frozen=True)
class DisplayNameTarget:
"""Describe where a display name update should land."""
room_id: str
user_id: str
name: str
in_room: bool
@dataclass(frozen=True)
class SynapseUserRef:
"""Reference to a Synapse user entry and its normalized localpart."""
entry: dict[str, Any]
user_id: str
localpart: str
class CommsServiceBase:
"""Common client/settings plumbing shared by the comms helpers."""
def __init__(self, client_factory: type[httpx.Client], settings: Any) -> None:
self._client_factory = client_factory
self._settings = settings
def _client(self) -> httpx.Client:
"""Create an HTTP client with the configured timeout."""
return self._client_factory(timeout=self._settings.comms_timeout_sec)
def _admin_token(self, fallback: str) -> str:
"""Return the Synapse admin token, falling back to the user token."""
token = getattr(self._settings, "comms_synapse_admin_token", "")
return token if token else fallback
def _resolve_alias(self, client: httpx.Client, token: str, alias: str) -> str:
"""Resolve a Matrix room alias to a room id."""
resp = client.get(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/directory/room/{urllib.parse.quote(alias)}",
headers=_auth(token),
)
resp.raise_for_status()
return resp.json()["room_id"]
def _room_members(self, client: httpx.Client, token: str, room_id: str) -> tuple[set[str], set[str]]:
"""Load joined members and display-name inventory for a room."""
resp = client.get(
f"{self._settings.comms_synapse_base}/_matrix/client/v3/rooms/{urllib.parse.quote(room_id)}/members",
headers=_auth(token),
)
resp.raise_for_status()
payload = resp.json()
members: set[str] = set()
existing: set[str] = set()
for ev in payload.get("chunk", []) or []:
user_id = ev.get("state_key")
if isinstance(user_id, str) and user_id:
members.add(user_id)
display = (ev.get("content") or {}).get("displayname")
if isinstance(display, str) and display:
existing.add(display)
return members, existing
def _auth(token: str) -> dict[str, str]:
return {"Authorization": f"Bearer {token}"}
def _canon_user(user: str, server_name: str) -> str:
user = (user or "").strip()
if user.startswith("@") and ":" in user:
return user
user = user.lstrip("@")
if ":" in user:
return f"@{user}"
return f"@{user}:{server_name}"
def _needs_rename_username(username: str) -> bool:
return username.isdigit() or username.startswith("guest-")
def _needs_rename_display(display: str | None) -> bool:
if not display:
return True
return display.isdigit() or display.startswith("guest-")

View File

@ -0,0 +1,524 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Any
from prometheus_client import Counter, Gauge
from ..k8s.client import delete_json, get_json
from ..settings import settings
from ..utils.logging import get_logger
logger = get_logger(__name__)
JENKINS_WORKSPACE_CLEANUP_RUNS_TOTAL = Counter(
"ariadne_jenkins_workspace_cleanup_runs_total",
"Jenkins workspace cleanup runs by status and mode",
["status", "mode"],
)
JENKINS_WORKSPACE_CLEANUP_OBJECTS_TOTAL = Counter(
"ariadne_jenkins_workspace_cleanup_objects_total",
"Jenkins workspace cleanup objects by kind, action, and mode",
["kind", "action", "mode"],
)
JENKINS_WORKSPACE_CLEANUP_LAST_RUN_TS = Gauge(
"ariadne_jenkins_workspace_cleanup_last_run_timestamp_seconds",
"Last Jenkins workspace cleanup run timestamp",
)
JENKINS_WORKSPACE_CLEANUP_LAST_SUCCESS_TS = Gauge(
"ariadne_jenkins_workspace_cleanup_last_success_timestamp_seconds",
"Last successful Jenkins workspace cleanup timestamp",
)
JENKINS_WORKSPACE_CLEANUP_LAST_FAILURE_TS = Gauge(
"ariadne_jenkins_workspace_cleanup_last_failure_timestamp_seconds",
"Last failed Jenkins workspace cleanup timestamp",
)
JENKINS_WORKSPACE_CLEANUP_LAST_DELETED = Gauge(
"ariadne_jenkins_workspace_cleanup_last_deleted_total",
"Last Jenkins workspace cleanup deleted object count",
["kind"],
)
JENKINS_WORKSPACE_CLEANUP_LAST_PLANNED = Gauge(
"ariadne_jenkins_workspace_cleanup_last_planned_total",
"Last Jenkins workspace cleanup planned object count",
["kind"],
)
JENKINS_WORKSPACE_CLEANUP_LAST_SKIPPED = Gauge(
"ariadne_jenkins_workspace_cleanup_last_skipped_total",
"Last Jenkins workspace cleanup skipped object count",
)
JENKINS_WORKSPACE_CLEANUP_LAST_FAILURES = Gauge(
"ariadne_jenkins_workspace_cleanup_last_failures_total",
"Last Jenkins workspace cleanup failure count",
)
@dataclass(frozen=True)
class JenkinsWorkspaceCleanupSummary:
"""Summarize one Jenkins workspace-storage cleanup pass.
Inputs: Kubernetes PV/PVC/Longhorn objects fetched from the API server.
Outputs: deterministic counters for operator logs and metrics.
"""
pvs_planned: int
pvcs_planned: int
volumes_planned: int
pvs_deleted: int
pvcs_deleted: int
volumes_deleted: int
skipped: int
failures: int
dry_run: bool
@property
def planned(self) -> int:
return self.pvs_planned + self.pvcs_planned + self.volumes_planned
@property
def deleted(self) -> int:
return self.pvs_deleted + self.pvcs_deleted + self.volumes_deleted
@dataclass(frozen=True)
class _CleanupCandidate:
name: str
kind: str
path: str
created_at: datetime | None
related_pvc: str | None = None
pv_name: str | None = None
def _parse_timestamp(raw: str) -> datetime | None:
"""Parse Kubernetes RFC3339 timestamps into timezone-aware datetimes."""
normalized = raw.replace("Z", "+00:00")
try:
return datetime.fromisoformat(normalized)
except ValueError:
return None
def _created_at(metadata: dict[str, Any]) -> datetime | None:
raw = metadata.get("creationTimestamp")
if not isinstance(raw, str) or not raw:
return None
return _parse_timestamp(raw)
def _is_old_enough(metadata: dict[str, Any]) -> bool:
"""Return true when an object age exceeds the configured cleanup threshold."""
created_at = _created_at(metadata)
if created_at is None:
return False
min_age = timedelta(hours=settings.jenkins_workspace_cleanup_min_age_hours)
return datetime.now(timezone.utc) - created_at >= min_age
def _is_deleting(metadata: dict[str, Any]) -> bool:
deletion_ts = metadata.get("deletionTimestamp")
return isinstance(deletion_ts, str) and bool(deletion_ts.strip())
def _is_workspace_name(name: Any) -> bool:
return isinstance(name, str) and name.startswith(settings.jenkins_workspace_pvc_prefix)
def _active_workspace_claims() -> set[str]:
"""Collect currently referenced Jenkins workspace PVC names from pods."""
namespace = settings.jenkins_workspace_namespace
payload = get_json(f"/api/v1/namespaces/{namespace}/pods")
items = payload.get("items") if isinstance(payload.get("items"), list) else []
active: set[str] = set()
for pod in items:
if not isinstance(pod, dict):
continue
metadata = pod.get("metadata") if isinstance(pod.get("metadata"), dict) else {}
annotations = metadata.get("annotations") if isinstance(metadata.get("annotations"), dict) else {}
spec = pod.get("spec") if isinstance(pod.get("spec"), dict) else {}
volumes = spec.get("volumes") if isinstance(spec.get("volumes"), list) else []
for volume in volumes:
if not isinstance(volume, dict):
continue
claim = volume.get("persistentVolumeClaim")
if not isinstance(claim, dict):
continue
claim_name = claim.get("claimName")
if _is_workspace_name(claim_name):
active.add(claim_name)
claim_name = annotations.get("jenkins.io/workspace-pvc")
if _is_workspace_name(claim_name):
active.add(claim_name)
return active
def _workspace_pv_candidates(active_claims: set[str]) -> tuple[list[_CleanupCandidate], set[str]]:
"""Find releasable Jenkins workspace PVs and keep a set of all PV names."""
namespace = settings.jenkins_workspace_namespace
payload = get_json("/api/v1/persistentvolumes")
items = payload.get("items") if isinstance(payload.get("items"), list) else []
candidates: list[_CleanupCandidate] = []
all_pv_names: set[str] = set()
for pv in items:
if not isinstance(pv, dict):
continue
metadata = pv.get("metadata") if isinstance(pv.get("metadata"), dict) else {}
status = pv.get("status") if isinstance(pv.get("status"), dict) else {}
spec = pv.get("spec") if isinstance(pv.get("spec"), dict) else {}
name = metadata.get("name")
if isinstance(name, str) and name:
all_pv_names.add(name)
claim_ref = spec.get("claimRef") if isinstance(spec.get("claimRef"), dict) else {}
claim_namespace = claim_ref.get("namespace")
claim_name = claim_ref.get("name")
phase = status.get("phase")
if claim_namespace != namespace:
continue
if not _is_workspace_name(claim_name):
continue
if _is_deleting(metadata):
continue
if claim_name in active_claims:
continue
if phase not in {"Released", "Failed"}:
continue
if not _is_old_enough(metadata):
continue
if not isinstance(name, str) or not name:
continue
candidates.append(
_CleanupCandidate(
name=name,
kind="pv",
path=f"/api/v1/persistentvolumes/{name}",
created_at=_created_at(metadata),
related_pvc=claim_name if isinstance(claim_name, str) else None,
)
)
return candidates, all_pv_names
def _workspace_pvc_candidates(active_claims: set[str]) -> list[_CleanupCandidate]:
"""Find stale Jenkins workspace PVCs that are not actively referenced."""
namespace = settings.jenkins_workspace_namespace
payload = get_json(f"/api/v1/namespaces/{namespace}/persistentvolumeclaims")
items = payload.get("items") if isinstance(payload.get("items"), list) else []
candidates: list[_CleanupCandidate] = []
for pvc in items:
if not isinstance(pvc, dict):
continue
metadata = pvc.get("metadata") if isinstance(pvc.get("metadata"), dict) else {}
status = pvc.get("status") if isinstance(pvc.get("status"), dict) else {}
claim_name = metadata.get("name")
phase = status.get("phase")
if not _is_workspace_name(claim_name):
continue
if _is_deleting(metadata):
continue
if claim_name in active_claims:
continue
if phase == "Bound":
continue
if not _is_old_enough(metadata):
continue
if not isinstance(claim_name, str) or not claim_name:
continue
candidates.append(
_CleanupCandidate(
name=claim_name,
kind="pvc",
path=f"/api/v1/namespaces/{namespace}/persistentvolumeclaims/{claim_name}",
created_at=_created_at(metadata),
)
)
return candidates
def _workspace_longhorn_candidates(all_pv_names: set[str], removed_pv_names: set[str]) -> list[_CleanupCandidate]:
namespace = "longhorn-system"
payload = get_json("/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes")
items = payload.get("items") if isinstance(payload.get("items"), list) else []
candidates: list[_CleanupCandidate] = []
for volume in items:
if not isinstance(volume, dict):
continue
metadata = volume.get("metadata") if isinstance(volume.get("metadata"), dict) else {}
status = volume.get("status") if isinstance(volume.get("status"), dict) else {}
spec = volume.get("spec") if isinstance(volume.get("spec"), dict) else {}
name = metadata.get("name")
if not isinstance(name, str) or not name:
continue
labels = metadata.get("labels") if isinstance(metadata.get("labels"), dict) else {}
pvc_name = labels.get("kubernetes.io/created-for/pvc/name")
robust_state = status.get("robustness")
state = status.get("state")
attached = status.get("isAttached")
frontend = spec.get("frontend")
should_delete = False
if name in removed_pv_names:
should_delete = True
elif _is_workspace_name(pvc_name) and name not in all_pv_names:
should_delete = True
if not should_delete:
continue
if _is_deleting(metadata):
continue
if not _is_old_enough(metadata):
continue
if state not in {None, "detached", "faulted", "unknown"}:
continue
if attached is True:
continue
if robust_state not in {None, "unknown", "faulted", "degraded"}:
continue
if frontend not in {None, "", "blockdev"}:
continue
candidates.append(
_CleanupCandidate(
name=name,
kind="longhorn_volume",
path=f"/apis/longhorn.io/v1beta2/namespaces/{namespace}/volumes/{name}",
created_at=_created_at(metadata),
pv_name=name,
)
)
return candidates
def _record_metrics(summary: JenkinsWorkspaceCleanupSummary) -> None:
mode = "dry_run" if summary.dry_run else "delete"
status = "ok" if summary.failures == 0 else "error"
JENKINS_WORKSPACE_CLEANUP_RUNS_TOTAL.labels(status=status, mode=mode).inc()
if summary.failures:
JENKINS_WORKSPACE_CLEANUP_LAST_FAILURE_TS.set(datetime.now(timezone.utc).timestamp())
else:
JENKINS_WORKSPACE_CLEANUP_LAST_SUCCESS_TS.set(datetime.now(timezone.utc).timestamp())
JENKINS_WORKSPACE_CLEANUP_LAST_RUN_TS.set(datetime.now(timezone.utc).timestamp())
JENKINS_WORKSPACE_CLEANUP_LAST_DELETED.labels(kind="pvc").set(summary.pvcs_deleted)
JENKINS_WORKSPACE_CLEANUP_LAST_DELETED.labels(kind="pv").set(summary.pvs_deleted)
JENKINS_WORKSPACE_CLEANUP_LAST_DELETED.labels(kind="longhorn_volume").set(summary.volumes_deleted)
JENKINS_WORKSPACE_CLEANUP_LAST_PLANNED.labels(kind="pvc").set(summary.pvcs_planned)
JENKINS_WORKSPACE_CLEANUP_LAST_PLANNED.labels(kind="pv").set(summary.pvs_planned)
JENKINS_WORKSPACE_CLEANUP_LAST_PLANNED.labels(kind="longhorn_volume").set(summary.volumes_planned)
JENKINS_WORKSPACE_CLEANUP_LAST_SKIPPED.set(summary.skipped)
JENKINS_WORKSPACE_CLEANUP_LAST_FAILURES.set(summary.failures)
for kind, planned, deleted in (
("pvc", summary.pvcs_planned, summary.pvcs_deleted),
("pv", summary.pvs_planned, summary.pvs_deleted),
("longhorn_volume", summary.volumes_planned, summary.volumes_deleted),
):
if planned:
JENKINS_WORKSPACE_CLEANUP_OBJECTS_TOTAL.labels(kind=kind, action="planned", mode=mode).inc(planned)
if deleted:
JENKINS_WORKSPACE_CLEANUP_OBJECTS_TOTAL.labels(kind=kind, action="deleted", mode=mode).inc(deleted)
if summary.skipped:
JENKINS_WORKSPACE_CLEANUP_OBJECTS_TOTAL.labels(
kind="cleanup",
action="skipped",
mode=mode,
).inc(summary.skipped)
if summary.failures:
JENKINS_WORKSPACE_CLEANUP_OBJECTS_TOTAL.labels(
kind="cleanup",
action="failed",
mode=mode,
).inc(summary.failures)
def cleanup_jenkins_workspace_storage() -> JenkinsWorkspaceCleanupSummary:
"""Delete stale Jenkins workspace PVC/PV artifacts and orphan Longhorn volumes."""
namespace = settings.jenkins_workspace_namespace
dry_run = settings.jenkins_workspace_cleanup_dry_run
max_deletions = settings.jenkins_workspace_cleanup_max_deletions_per_run
prefix = settings.jenkins_workspace_pvc_prefix.strip()
pvs_deleted = 0
pvcs_deleted = 0
volumes_deleted = 0
skipped = 0
failures = 0
stale_pvs: list[_CleanupCandidate] = []
stale_pvcs: list[_CleanupCandidate] = []
stale_volumes: list[_CleanupCandidate] = []
summary: JenkinsWorkspaceCleanupSummary
try:
if not namespace.strip():
raise ValueError("jenkins workspace cleanup namespace is empty")
if not prefix:
raise ValueError("jenkins workspace cleanup pvc prefix is empty")
if settings.jenkins_workspace_cleanup_min_age_hours < 1.0:
raise ValueError("jenkins workspace cleanup min age must be >= 1 hour")
if max_deletions < 1:
raise ValueError("jenkins workspace cleanup max deletions must be >= 1")
active_claims = _active_workspace_claims()
stale_pvs, all_pv_names = _workspace_pv_candidates(active_claims)
stale_pvcs = _workspace_pvc_candidates(active_claims)
removed_pv_names: set[str] = set()
stale_volumes = _workspace_longhorn_candidates(all_pv_names, removed_pv_names)
planned_total = len(stale_pvs) + len(stale_pvcs) + len(stale_volumes)
deletion_budget: int | None = None
if dry_run:
logger.info(
"jenkins workspace cleanup dry-run enabled",
extra={
"event": "jenkins_workspace_cleanup",
"status": "dry_run",
"namespace": namespace,
"dry_run": True,
"planned_pvs": len(stale_pvs),
"planned_pvcs": len(stale_pvcs),
"planned_volumes": len(stale_volumes),
"max_deletions": max_deletions,
},
)
else:
deletion_budget = max_deletions
if not dry_run and planned_total > max_deletions:
logger.warning(
"jenkins workspace cleanup capped by max deletions guard",
extra={
"event": "jenkins_workspace_cleanup",
"status": "guard_capped",
"namespace": namespace,
"dry_run": False,
"planned_total": planned_total,
"max_deletions": max_deletions,
"planned_pvs": len(stale_pvs),
"planned_pvcs": len(stale_pvcs),
"planned_volumes": len(stale_volumes),
},
)
for pvc in stale_pvcs:
claim_name = pvc.name
if not claim_name:
skipped += 1
continue
if dry_run:
continue
if deletion_budget is not None and deletion_budget <= 0:
skipped += 1
continue
if deletion_budget is not None:
deletion_budget -= 1
try:
delete_json(pvc.path)
pvcs_deleted += 1
except Exception as exc:
failures += 1
logger.info(
"jenkins workspace pvc delete failed",
extra={"event": "jenkins_workspace_cleanup", "claim": claim_name, "detail": str(exc)},
)
for pv in stale_pvs:
pv_name = pv.name
if not pv_name:
skipped += 1
continue
if dry_run:
continue
if deletion_budget is not None and deletion_budget <= 0:
skipped += 1
continue
if deletion_budget is not None:
deletion_budget -= 1
try:
delete_json(pv.path)
removed_pv_names.add(pv_name)
pvs_deleted += 1
except Exception as exc:
failures += 1
logger.info(
"jenkins workspace pv delete failed",
extra={"event": "jenkins_workspace_cleanup", "pv": pv_name, "detail": str(exc)},
)
# Recompute longhorn candidates using the updated removed PV list.
stale_volumes = _workspace_longhorn_candidates(all_pv_names, removed_pv_names)
for volume in stale_volumes:
if not volume.name:
skipped += 1
continue
if dry_run:
continue
if deletion_budget is not None and deletion_budget <= 0:
skipped += 1
continue
if deletion_budget is not None:
deletion_budget -= 1
try:
delete_json(volume.path)
volumes_deleted += 1
except Exception as exc:
failures += 1
logger.info(
"jenkins workspace longhorn volume delete failed",
extra={"event": "jenkins_workspace_cleanup", "volume": volume.name, "detail": str(exc)},
)
summary = JenkinsWorkspaceCleanupSummary(
pvs_planned=len(stale_pvs),
pvcs_planned=len(stale_pvcs),
volumes_planned=len(stale_volumes),
pvs_deleted=pvs_deleted,
pvcs_deleted=pvcs_deleted,
volumes_deleted=volumes_deleted,
skipped=skipped,
failures=failures,
dry_run=dry_run,
)
except Exception as exc:
failures += 1
logger.exception(
"jenkins workspace cleanup failed",
extra={"event": "jenkins_workspace_cleanup", "status": "error", "namespace": namespace, "detail": str(exc)},
)
summary = JenkinsWorkspaceCleanupSummary(
pvs_planned=len(stale_pvs),
pvcs_planned=len(stale_pvcs),
volumes_planned=len(stale_volumes),
pvs_deleted=pvs_deleted,
pvcs_deleted=pvcs_deleted,
volumes_deleted=volumes_deleted,
skipped=skipped,
failures=failures,
dry_run=dry_run,
)
_record_metrics(summary)
raise
_record_metrics(summary)
logger.info(
"jenkins workspace cleanup finished",
extra={
"event": "jenkins_workspace_cleanup",
"status": "ok" if failures == 0 else "error",
"dry_run": dry_run,
"namespace": namespace,
"planned_pvs": summary.pvs_planned,
"planned_pvcs": summary.pvcs_planned,
"planned_volumes": summary.volumes_planned,
"deleted_pvs": pvs_deleted,
"deleted_pvcs": pvcs_deleted,
"deleted_volumes": volumes_deleted,
"skipped": skipped,
"failures": failures,
},
)
return summary

View File

@ -168,6 +168,11 @@ class Settings:
platform_quality_probe_wait_timeout_sec: float platform_quality_probe_wait_timeout_sec: float
platform_quality_probe_pushgateway_url: str platform_quality_probe_pushgateway_url: str
platform_quality_probe_http_timeout_sec: int platform_quality_probe_http_timeout_sec: int
jenkins_workspace_namespace: str
jenkins_workspace_pvc_prefix: str
jenkins_workspace_cleanup_min_age_hours: float
jenkins_workspace_cleanup_dry_run: bool
jenkins_workspace_cleanup_max_deletions_per_run: int
vaultwarden_namespace: str vaultwarden_namespace: str
vaultwarden_pod_label: str vaultwarden_pod_label: str
@ -234,6 +239,7 @@ class Settings:
metis_token_sync_vault_k8s_role: str metis_token_sync_vault_k8s_role: str
metis_k3s_token_sync_cron: str metis_k3s_token_sync_cron: str
platform_quality_suite_probe_cron: str platform_quality_suite_probe_cron: str
jenkins_workspace_cleanup_cron: str
opensearch_url: str opensearch_url: str
opensearch_limit_bytes: int opensearch_limit_bytes: int
@ -459,6 +465,19 @@ class Settings:
"platform_quality_probe_http_timeout_sec": _env_int("PLATFORM_QUALITY_PROBE_HTTP_TIMEOUT_SECONDS", 12), "platform_quality_probe_http_timeout_sec": _env_int("PLATFORM_QUALITY_PROBE_HTTP_TIMEOUT_SECONDS", 12),
} }
@classmethod
def _jenkins_workspace_cleanup_config(cls) -> dict[str, Any]:
return {
"jenkins_workspace_namespace": _env("JENKINS_WORKSPACE_NAMESPACE", "jenkins"),
"jenkins_workspace_pvc_prefix": _env("JENKINS_WORKSPACE_PVC_PREFIX", "pvc-workspace-"),
"jenkins_workspace_cleanup_min_age_hours": _env_float("JENKINS_WORKSPACE_CLEANUP_MIN_AGE_HOURS", 12.0),
"jenkins_workspace_cleanup_dry_run": _env_bool("JENKINS_WORKSPACE_CLEANUP_DRY_RUN", "false"),
"jenkins_workspace_cleanup_max_deletions_per_run": _env_int(
"JENKINS_WORKSPACE_CLEANUP_MAX_DELETIONS_PER_RUN",
20,
),
}
@classmethod @classmethod
def _vaultwarden_config(cls) -> dict[str, Any]: def _vaultwarden_config(cls) -> dict[str, Any]:
return { return {
@ -505,6 +524,10 @@ class Settings:
"ARIADNE_SCHEDULE_PLATFORM_QUALITY_SUITE_PROBE", "ARIADNE_SCHEDULE_PLATFORM_QUALITY_SUITE_PROBE",
"*/15 * * * *", "*/15 * * * *",
), ),
"jenkins_workspace_cleanup_cron": _env(
"ARIADNE_SCHEDULE_JENKINS_WORKSPACE_CLEANUP",
"45 */6 * * *",
),
} }
@classmethod @classmethod
@ -565,6 +588,7 @@ class Settings:
comms_cfg = cls._comms_config() comms_cfg = cls._comms_config()
image_cfg = cls._image_sweeper_config() image_cfg = cls._image_sweeper_config()
platform_quality_probe_cfg = cls._platform_quality_probe_config() platform_quality_probe_cfg = cls._platform_quality_probe_config()
jenkins_workspace_cleanup_cfg = cls._jenkins_workspace_cleanup_config()
vaultwarden_cfg = cls._vaultwarden_config() vaultwarden_cfg = cls._vaultwarden_config()
schedule_cfg = cls._schedule_config() schedule_cfg = cls._schedule_config()
cluster_cfg = cls._cluster_state_config() cluster_cfg = cls._cluster_state_config()
@ -605,6 +629,7 @@ class Settings:
**comms_cfg, **comms_cfg,
**image_cfg, **image_cfg,
**platform_quality_probe_cfg, **platform_quality_probe_cfg,
**jenkins_workspace_cleanup_cfg,
**vaultwarden_cfg, **vaultwarden_cfg,
**schedule_cfg, **schedule_cfg,
**cluster_cfg, **cluster_cfg,

23
ci/coverage_contract.json Normal file
View File

@ -0,0 +1,23 @@
{
"phase": "phase-1",
"files": {
"ariadne/services/cluster_state.py": 88,
"ariadne/services/cluster_state_impl/analysis.py": 55,
"ariadne/services/cluster_state_impl/attention.py": 79,
"ariadne/services/cluster_state_impl/common.py": 64,
"ariadne/services/cluster_state_impl/context.py": 68,
"ariadne/services/cluster_state_impl/inventory.py": 81,
"ariadne/services/cluster_state_impl/k8s.py": 76,
"ariadne/services/cluster_state_impl/metrics.py": 42,
"ariadne/services/cluster_state_impl/metrics_extra.py": 87,
"ariadne/services/cluster_state_impl/nodes.py": 24,
"ariadne/services/cluster_state_impl/pods.py": 67,
"ariadne/services/cluster_state_impl/profiles.py": 70,
"ariadne/services/cluster_state_impl/trends.py": 73,
"ariadne/services/cluster_state_impl/vm.py": 48,
"ariadne/services/comms.py": 95,
"ariadne/services/comms_guest.py": 65,
"ariadne/services/comms_room.py": 90,
"ariadne/services/comms_support.py": 88
}
}

View File

@ -1,2 +1,3 @@
[pytest] [pytest]
pythonpath = . pythonpath = .
testpaths = testing

View File

@ -0,0 +1,76 @@
#!/usr/bin/env python3
"""Validate per-file coverage against an explicit contract.
Why: the repo is ratcheting from an overall coverage floor toward per-file
coverage guarantees, and CI needs a machine-checkable list of the files that are
in phase 1 of that ratchet.
Inputs:
- `coverage_json`: slipcover JSON artifact with per-file summaries.
- `contract_json`: JSON object containing a `files` map of file paths to
minimum coverage percentages.
Output:
- Exit 0 when every contracted file meets or exceeds its minimum.
- Exit 1 with a concise report when any file falls short or is missing.
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Any
def _load_json(path: Path) -> dict[str, Any]:
"""Load a JSON file and return the parsed object."""
return json.loads(path.read_text(encoding="utf-8"))
def _file_percent(entry: dict[str, Any]) -> float | None:
summary = entry.get("summary") if isinstance(entry.get("summary"), dict) else {}
percent = summary.get("percent_covered")
return float(percent) if isinstance(percent, (int, float)) else None
def check_coverage_contract(coverage_json: Path, contract_json: Path) -> int:
"""Check the supplied coverage artifact against the contract file."""
coverage = _load_json(coverage_json)
contract = _load_json(contract_json)
files = coverage.get("files") if isinstance(coverage.get("files"), dict) else {}
contracted = contract.get("files") if isinstance(contract.get("files"), dict) else {}
failures: list[str] = []
for path, minimum in sorted(contracted.items()):
entry = files.get(path)
if not isinstance(entry, dict):
failures.append(f"{path}: missing from coverage report (minimum {minimum}%)")
continue
percent = _file_percent(entry)
if percent is None:
failures.append(f"{path}: no per-file percentage in coverage report")
continue
if percent < float(minimum):
failures.append(f"{path}: {percent:.1f}% < {float(minimum):.1f}%")
if failures:
print("Coverage contract failed:")
for failure in failures:
print(f"- {failure}")
return 1
print(f"Coverage contract passed for {len(contracted)} files.")
return 0
def main() -> int:
parser = argparse.ArgumentParser()
parser.add_argument("coverage_json", type=Path)
parser.add_argument("contract_json", type=Path)
args = parser.parse_args()
return check_coverage_contract(args.coverage_json, args.contract_json)
if __name__ == "__main__":
raise SystemExit(main())

View File

@ -9,6 +9,9 @@ import sys
import urllib.request import urllib.request
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
HTTP_BAD_REQUEST = 400
MIN_METRIC_FIELDS = 2
def _escape_label(value: str) -> str: def _escape_label(value: str) -> str:
return value.replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"') return value.replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
@ -73,7 +76,7 @@ def _post_text(url: str, payload: str) -> None:
headers={"Content-Type": "text/plain"}, headers={"Content-Type": "text/plain"},
) )
with urllib.request.urlopen(req, timeout=10) as resp: with urllib.request.urlopen(req, timeout=10) as resp:
if resp.status >= 400: if resp.status >= HTTP_BAD_REQUEST:
raise RuntimeError(f"metrics push failed status={resp.status}") raise RuntimeError(f"metrics push failed status={resp.status}")
@ -88,7 +91,7 @@ def _fetch_existing_counter(pushgateway_url: str, metric: str, labels: dict[str,
if any(f'{k}="{v}"' not in line for k, v in labels.items()): if any(f'{k}="{v}"' not in line for k, v in labels.items()):
continue continue
parts = line.split() parts = line.split()
if len(parts) < 2: if len(parts) < MIN_METRIC_FIELDS:
continue continue
try: try:
return float(parts[1]) return float(parts[1])

1
testing/__init__.py Normal file
View File

@ -0,0 +1 @@
"""Ariadne test suite package."""

View File

@ -64,6 +64,7 @@ def test_startup_registers_metis_watch(monkeypatch) -> None:
assert any(name == "schedule.metis_sentinel_watch" for name, _cron in tasks) assert any(name == "schedule.metis_sentinel_watch" for name, _cron in tasks)
assert any(name == "schedule.metis_k3s_token_sync" for name, _cron in tasks) assert any(name == "schedule.metis_k3s_token_sync" for name, _cron in tasks)
assert any(name == "schedule.platform_quality_suite_probe" for name, _cron in tasks) assert any(name == "schedule.platform_quality_suite_probe" for name, _cron in tasks)
assert any(name == "schedule.jenkins_workspace_cleanup" for name, _cron in tasks)
def test_record_event_handles_exception(monkeypatch) -> None: def test_record_event_handles_exception(monkeypatch) -> None:

View File

@ -20,7 +20,7 @@ def test_keycloak_verify_accepts_matching_audience(monkeypatch) -> None:
kc = KeycloakOIDC("https://jwks", "https://issuer", "portal") kc = KeycloakOIDC("https://jwks", "https://issuer", "portal")
monkeypatch.setattr(kc, "_get_jwks", lambda force=False: {"keys": [{"kid": "test"}]}) monkeypatch.setattr(kc, "_get_jwks", lambda force=False: {"keys": [{"kid": "test"}]})
monkeypatch.setattr(jwt.algorithms.RSAAlgorithm, "from_jwk", lambda key: "dummy") monkeypatch.setattr(jwt.PyJWK, "from_dict", staticmethod(lambda key: type("JWK", (), {"key": "dummy"})()))
monkeypatch.setattr( monkeypatch.setattr(
jwt, jwt,
"decode", "decode",
@ -36,7 +36,7 @@ def test_keycloak_verify_rejects_wrong_audience(monkeypatch) -> None:
kc = KeycloakOIDC("https://jwks", "https://issuer", "portal") kc = KeycloakOIDC("https://jwks", "https://issuer", "portal")
monkeypatch.setattr(kc, "_get_jwks", lambda force=False: {"keys": [{"kid": "test"}]}) monkeypatch.setattr(kc, "_get_jwks", lambda force=False: {"keys": [{"kid": "test"}]})
monkeypatch.setattr(jwt.algorithms.RSAAlgorithm, "from_jwk", lambda key: "dummy") monkeypatch.setattr(jwt.PyJWK, "from_dict", staticmethod(lambda key: type("JWK", (), {"key": "dummy"})()))
monkeypatch.setattr( monkeypatch.setattr(
jwt, jwt,
"decode", "decode",
@ -73,7 +73,7 @@ def test_keycloak_verify_refreshes_jwks(monkeypatch) -> None:
return {"keys": [{"kid": "test"}]} return {"keys": [{"kid": "test"}]}
monkeypatch.setattr(kc, "_get_jwks", fake_get_jwks) monkeypatch.setattr(kc, "_get_jwks", fake_get_jwks)
monkeypatch.setattr(jwt.algorithms.RSAAlgorithm, "from_jwk", lambda key: "dummy") monkeypatch.setattr(jwt.PyJWK, "from_dict", staticmethod(lambda key: type("JWK", (), {"key": "dummy"})()))
monkeypatch.setattr( monkeypatch.setattr(
jwt, jwt,
"decode", "decode",

View File

@ -0,0 +1,323 @@
from __future__ import annotations
from datetime import datetime, timezone
import types
from prometheus_client import REGISTRY
from ariadne.services import jenkins_workspace_cleanup as cleanup_module
def _metric_value(name: str, labels: dict[str, str]) -> float:
value = REGISTRY.get_sample_value(name, labels)
return float(value) if value is not None else 0.0
def _dummy_settings(*, dry_run: bool, max_deletions: int = 20) -> types.SimpleNamespace:
return types.SimpleNamespace(
jenkins_workspace_namespace="jenkins",
jenkins_workspace_pvc_prefix="pvc-workspace-",
jenkins_workspace_cleanup_min_age_hours=1.0,
jenkins_workspace_cleanup_dry_run=dry_run,
jenkins_workspace_cleanup_max_deletions_per_run=max_deletions,
)
def _fake_payloads(now_iso: str, old_iso: str) -> dict[str, dict[str, object]]:
return {
"/api/v1/namespaces/jenkins/pods": {
"items": [
{
"metadata": {
"annotations": {
"jenkins.io/workspace-pvc": "pvc-workspace-annotated-active",
}
},
"spec": {
"volumes": [
{"persistentVolumeClaim": {"claimName": "pvc-workspace-active"}},
]
},
}
]
},
"/api/v1/namespaces/jenkins/persistentvolumeclaims": {
"items": [
{
"metadata": {"name": "pvc-workspace-stale", "creationTimestamp": old_iso},
"status": {"phase": "Lost"},
},
{
"metadata": {"name": "pvc-workspace-active", "creationTimestamp": old_iso},
"status": {"phase": "Bound"},
},
{
"metadata": {"name": "pvc-workspace-annotated-active", "creationTimestamp": old_iso},
"status": {"phase": "Lost"},
},
{
"metadata": {"name": "pvc-workspace-fresh", "creationTimestamp": now_iso},
"status": {"phase": "Lost"},
},
{
"metadata": {
"name": "pvc-workspace-deleting",
"creationTimestamp": old_iso,
"deletionTimestamp": old_iso,
},
"status": {"phase": "Lost"},
},
]
},
"/api/v1/persistentvolumes": {
"items": [
{
"metadata": {"name": "pvc-old", "creationTimestamp": old_iso},
"status": {"phase": "Released"},
"spec": {"claimRef": {"namespace": "jenkins", "name": "pvc-workspace-stale"}},
},
{
"metadata": {"name": "pvc-active", "creationTimestamp": old_iso},
"status": {"phase": "Released"},
"spec": {"claimRef": {"namespace": "jenkins", "name": "pvc-workspace-active"}},
},
{
"metadata": {"name": "pvc-annotated", "creationTimestamp": old_iso},
"status": {"phase": "Released"},
"spec": {"claimRef": {"namespace": "jenkins", "name": "pvc-workspace-annotated-active"}},
},
{
"metadata": {"name": "pvc-fresh", "creationTimestamp": now_iso},
"status": {"phase": "Released"},
"spec": {"claimRef": {"namespace": "jenkins", "name": "pvc-workspace-fresh"}},
},
{
"metadata": {
"name": "pvc-deleting",
"creationTimestamp": old_iso,
"deletionTimestamp": old_iso,
},
"status": {"phase": "Released"},
"spec": {"claimRef": {"namespace": "jenkins", "name": "pvc-workspace-deleting"}},
},
]
},
"/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes": {
"items": [
{"metadata": {"name": "pvc-old", "creationTimestamp": old_iso}},
{
"metadata": {
"name": "pvc-orphan",
"creationTimestamp": old_iso,
"labels": {
"kubernetes.io/created-for/pvc/name": "pvc-workspace-orphan",
},
}
},
{
"metadata": {
"name": "pvc-attached",
"creationTimestamp": old_iso,
"labels": {
"kubernetes.io/created-for/pvc/name": "pvc-workspace-annotated-active",
},
},
"status": {"state": "attached", "isAttached": True, "robustness": "healthy"},
"spec": {"frontend": "blockdev"},
},
{
"metadata": {
"name": "pvc-orphan-fresh",
"creationTimestamp": now_iso,
"labels": {
"kubernetes.io/created-for/pvc/name": "pvc-workspace-fresh",
},
}
},
{
"metadata": {
"name": "pvc-vol-deleting",
"creationTimestamp": old_iso,
"deletionTimestamp": old_iso,
"labels": {
"kubernetes.io/created-for/pvc/name": "pvc-workspace-orphan",
},
}
},
]
},
}
def test_cleanup_jenkins_workspace_storage_dry_run(monkeypatch) -> None:
monkeypatch.setattr(cleanup_module, "settings", _dummy_settings(dry_run=True))
now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
old_iso = "2020-01-01T00:00:00Z"
payloads = _fake_payloads(now_iso, old_iso)
deleted_paths: list[str] = []
def fake_get_json(path: str):
if path in payloads:
return payloads[path]
raise AssertionError(f"unexpected path: {path}")
def fake_delete_json(path: str):
deleted_paths.append(path)
return {"status": "Success"}
before_runs = _metric_value(
"ariadne_jenkins_workspace_cleanup_runs_total",
{"status": "ok", "mode": "dry_run"},
)
before_planned = _metric_value(
"ariadne_jenkins_workspace_cleanup_objects_total",
{"kind": "pvc", "action": "planned", "mode": "dry_run"},
)
monkeypatch.setattr(cleanup_module, "get_json", fake_get_json)
monkeypatch.setattr(cleanup_module, "delete_json", fake_delete_json)
summary = cleanup_module.cleanup_jenkins_workspace_storage()
assert summary.dry_run is True
assert summary.pvcs_planned == 1
assert summary.pvs_planned == 1
assert summary.volumes_planned == 2
assert summary.pvcs_deleted == 0
assert summary.pvs_deleted == 0
assert summary.volumes_deleted == 0
assert summary.failures == 0
assert deleted_paths == []
assert _metric_value(
"ariadne_jenkins_workspace_cleanup_runs_total",
{"status": "ok", "mode": "dry_run"},
) == before_runs + 1
assert _metric_value(
"ariadne_jenkins_workspace_cleanup_objects_total",
{"kind": "pvc", "action": "planned", "mode": "dry_run"},
) == before_planned + 1
def test_cleanup_jenkins_workspace_storage(monkeypatch) -> None:
monkeypatch.setattr(cleanup_module, "settings", _dummy_settings(dry_run=False))
now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
old_iso = "2020-01-01T00:00:00Z"
deleted_paths: list[str] = []
payloads = _fake_payloads(now_iso, old_iso)
def fake_get_json(path: str):
if path in payloads:
return payloads[path]
raise AssertionError(f"unexpected path: {path}")
def fake_delete_json(path: str):
deleted_paths.append(path)
return {"status": "Success"}
before_runs = _metric_value(
"ariadne_jenkins_workspace_cleanup_runs_total",
{"status": "ok", "mode": "delete"},
)
before_deleted = _metric_value(
"ariadne_jenkins_workspace_cleanup_objects_total",
{"kind": "longhorn_volume", "action": "deleted", "mode": "delete"},
)
monkeypatch.setattr(cleanup_module, "get_json", fake_get_json)
monkeypatch.setattr(cleanup_module, "delete_json", fake_delete_json)
summary = cleanup_module.cleanup_jenkins_workspace_storage()
assert summary.pvcs_deleted == 1
assert summary.pvs_deleted == 1
assert summary.volumes_deleted == 2
assert summary.failures == 0
assert "/api/v1/namespaces/jenkins/persistentvolumeclaims/pvc-workspace-stale" in deleted_paths
assert "/api/v1/persistentvolumes/pvc-old" in deleted_paths
assert "/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes/pvc-old" in deleted_paths
assert "/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes/pvc-orphan" in deleted_paths
assert "/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes/pvc-attached" not in deleted_paths
assert _metric_value(
"ariadne_jenkins_workspace_cleanup_runs_total",
{"status": "ok", "mode": "delete"},
) == before_runs + 1
assert _metric_value(
"ariadne_jenkins_workspace_cleanup_objects_total",
{"kind": "longhorn_volume", "action": "deleted", "mode": "delete"},
) == before_deleted + 2
def test_cleanup_jenkins_workspace_storage_failure(monkeypatch) -> None:
monkeypatch.setattr(cleanup_module, "settings", _dummy_settings(dry_run=False))
def fake_get_json(path: str):
if path == "/api/v1/namespaces/jenkins/pods":
return {"items": []}
if path == "/api/v1/namespaces/jenkins/persistentvolumeclaims":
return {
"items": [
{
"metadata": {"name": "pvc-workspace-stale", "creationTimestamp": "2020-01-01T00:00:00Z"},
"status": {"phase": "Lost"},
}
]
}
if path == "/api/v1/persistentvolumes":
return {"items": []}
if path == "/apis/longhorn.io/v1beta2/namespaces/longhorn-system/volumes":
return {"items": []}
raise AssertionError(f"unexpected path: {path}")
def fake_delete_json(_path: str):
raise RuntimeError("boom")
before_failures = _metric_value(
"ariadne_jenkins_workspace_cleanup_objects_total",
{"kind": "cleanup", "action": "failed", "mode": "delete"},
)
monkeypatch.setattr(cleanup_module, "get_json", fake_get_json)
monkeypatch.setattr(cleanup_module, "delete_json", fake_delete_json)
summary = cleanup_module.cleanup_jenkins_workspace_storage()
assert summary.failures == 1
assert summary.pvcs_deleted == 0
assert _metric_value(
"ariadne_jenkins_workspace_cleanup_objects_total",
{"kind": "cleanup", "action": "failed", "mode": "delete"},
) == before_failures + 1
def test_cleanup_jenkins_workspace_storage_guard_caps_mass_delete(monkeypatch) -> None:
monkeypatch.setattr(cleanup_module, "settings", _dummy_settings(dry_run=False, max_deletions=1))
now_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
old_iso = "2020-01-01T00:00:00Z"
payloads = _fake_payloads(now_iso, old_iso)
deleted_paths: list[str] = []
def fake_get_json(path: str):
if path in payloads:
return payloads[path]
raise AssertionError(f"unexpected path: {path}")
def fake_delete_json(path: str):
deleted_paths.append(path)
return {"status": "Success"}
monkeypatch.setattr(cleanup_module, "get_json", fake_get_json)
monkeypatch.setattr(cleanup_module, "delete_json", fake_delete_json)
summary = cleanup_module.cleanup_jenkins_workspace_storage()
assert summary.failures == 0
assert summary.pvcs_planned == 1
assert summary.pvs_planned == 1
assert summary.volumes_planned == 1
assert summary.pvcs_deleted == 1
assert summary.pvs_deleted == 0
assert summary.volumes_deleted == 0
assert summary.skipped == 2
assert deleted_paths == ["/api/v1/namespaces/jenkins/persistentvolumeclaims/pvc-workspace-stale"]