ariadne/ariadne/services/jenkins_workspace_cleanup.py

392 lines
14 KiB
Python

from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
from prometheus_client import Counter, Gauge
from ..k8s.client import delete_json, get_json
from ..settings import settings
from ..utils.logging import get_logger
from .jenkins_workspace_candidates import (
_CleanupCandidate,
_active_workspace_claims,
_workspace_longhorn_candidates,
_workspace_pv_candidates,
_workspace_pvc_candidates,
)
# Module-level logger shared by every helper in this file.
logger = get_logger(__name__)

# Cumulative count of cleanup runs, labelled by outcome status and run mode.
JENKINS_WORKSPACE_CLEANUP_RUNS_TOTAL = Counter(
    "ariadne_jenkins_workspace_cleanup_runs_total",
    "Jenkins workspace cleanup runs by status and mode",
    ["status", "mode"],
)

# Cumulative count of objects touched by cleanup, labelled by object kind,
# the action taken on them, and run mode.
JENKINS_WORKSPACE_CLEANUP_OBJECTS_TOTAL = Counter(
    "ariadne_jenkins_workspace_cleanup_objects_total",
    "Jenkins workspace cleanup objects by kind, action, and mode",
    ["kind", "action", "mode"],
)

# Timestamp gauges (seconds): most recent run, most recent success, and most
# recent failure. All three are unlabelled.
JENKINS_WORKSPACE_CLEANUP_LAST_RUN_TS = Gauge(
    "ariadne_jenkins_workspace_cleanup_last_run_timestamp_seconds",
    "Last Jenkins workspace cleanup run timestamp",
)
JENKINS_WORKSPACE_CLEANUP_LAST_SUCCESS_TS = Gauge(
    "ariadne_jenkins_workspace_cleanup_last_success_timestamp_seconds",
    "Last successful Jenkins workspace cleanup timestamp",
)
JENKINS_WORKSPACE_CLEANUP_LAST_FAILURE_TS = Gauge(
    "ariadne_jenkins_workspace_cleanup_last_failure_timestamp_seconds",
    "Last failed Jenkins workspace cleanup timestamp",
)

# Per-kind object counts from the most recent run (overwritten every run).
JENKINS_WORKSPACE_CLEANUP_LAST_DELETED = Gauge(
    "ariadne_jenkins_workspace_cleanup_last_deleted_total",
    "Last Jenkins workspace cleanup deleted object count",
    ["kind"],
)
JENKINS_WORKSPACE_CLEANUP_LAST_PLANNED = Gauge(
    "ariadne_jenkins_workspace_cleanup_last_planned_total",
    "Last Jenkins workspace cleanup planned object count",
    ["kind"],
)

# Run-wide (unlabelled) skip and failure counts from the most recent run.
JENKINS_WORKSPACE_CLEANUP_LAST_SKIPPED = Gauge(
    "ariadne_jenkins_workspace_cleanup_last_skipped_total",
    "Last Jenkins workspace cleanup skipped object count",
)
JENKINS_WORKSPACE_CLEANUP_LAST_FAILURES = Gauge(
    "ariadne_jenkins_workspace_cleanup_last_failures_total",
    "Last Jenkins workspace cleanup failure count",
)
@dataclass(frozen=True)
class JenkinsWorkspaceCleanupSummary:
    """Immutable tally of a single Jenkins workspace-storage cleanup pass.

    Holds, per object kind (PV, PVC, Longhorn volume), how many objects were
    planned for deletion and how many were deleted, plus run-wide skip and
    failure counts and whether the pass ran in dry-run mode.
    """

    # Objects identified as cleanup candidates, per kind.
    pvs_planned: int
    pvcs_planned: int
    volumes_planned: int
    # Objects successfully deleted, per kind.
    pvs_deleted: int
    pvcs_deleted: int
    volumes_deleted: int
    # Run-wide counters across all kinds.
    skipped: int
    failures: int
    # True when the pass only planned deletions.
    dry_run: bool

    @property
    def planned(self) -> int:
        """Return the number of planned objects summed over all kinds."""
        return sum((self.pvs_planned, self.pvcs_planned, self.volumes_planned))

    @property
    def deleted(self) -> int:
        """Return the number of deleted objects summed over all kinds."""
        return sum((self.pvs_deleted, self.pvcs_deleted, self.volumes_deleted))
def _validate_cleanup_settings() -> tuple[str, str, bool, int]:
    """Read the cleanup settings and reject unusable values.

    Returns:
        ``(namespace, prefix, dry_run, max_deletions)``. The prefix is
        returned with surrounding whitespace stripped; the namespace is
        returned exactly as configured.

    Raises:
        ValueError: if the namespace or prefix is blank, the minimum age is
            below one hour, or the per-run deletion cap is below one.
    """
    configured_namespace = settings.jenkins_workspace_namespace
    pvc_prefix = settings.jenkins_workspace_pvc_prefix.strip()
    is_dry_run = settings.jenkins_workspace_cleanup_dry_run
    deletion_cap = settings.jenkins_workspace_cleanup_max_deletions_per_run

    # The checks are evaluated lazily, top to bottom; the first failing one
    # decides the error message.
    if not configured_namespace.strip():
        problem = "jenkins workspace cleanup namespace is empty"
    elif not pvc_prefix:
        problem = "jenkins workspace cleanup pvc prefix is empty"
    elif settings.jenkins_workspace_cleanup_min_age_hours < 1.0:
        problem = "jenkins workspace cleanup min age must be >= 1 hour"
    elif deletion_cap < 1:
        problem = "jenkins workspace cleanup max deletions must be >= 1"
    else:
        return configured_namespace, pvc_prefix, is_dry_run, deletion_cap
    raise ValueError(problem)
def _planned_removed_pv_names_dry_run(
    stale_pvcs: list[_CleanupCandidate],
    stale_pvs: list[_CleanupCandidate],
    max_deletions: int,
) -> set[str]:
    """Predict which stale PV names a real run would remove, deleting nothing.

    The prediction follows the budget accounting of ``_delete_candidates``:
    PVCs are processed before PVs, each *named* candidate spends one unit of
    the deletion budget, and candidates without a name are skipped without
    spending any. Every attempted deletion is assumed to succeed.

    Args:
        stale_pvcs: PVC candidates that would be processed first.
        stale_pvs: PV candidates, in the order they would be processed.
        max_deletions: Total deletion budget for the run.

    Returns:
        Names of the PVs that fit in the budget left over after the PVCs;
        empty when no budget remains.
    """
    # Nameless PVC candidates are skipped for free in a real run, so they
    # must not be charged against the simulated budget either.
    named_pvc_count = sum(1 for candidate in stale_pvcs if candidate.name)
    remaining = max(max_deletions - named_pvc_count, 0)
    if remaining == 0:
        return set()
    names = [candidate.name for candidate in stale_pvs if candidate.name]
    return set(names[:remaining])
def _delete_candidates(
    candidates: list[_CleanupCandidate],
    *,
    deletion_budget: int | None,
    failure_log: str,
    failure_field: str,
    removed_pv_names: set[str] | None = None,
) -> tuple[int, int, int, int | None]:
    """Attempt to delete each candidate, honouring an optional budget.

    Args:
        candidates: Objects to delete, processed in list order.
        deletion_budget: Maximum number of delete attempts, or ``None`` for
            no limit.
        failure_log: Log message emitted when a delete call raises.
        failure_field: Key under which the candidate name is placed in the
            log record's ``extra`` mapping.
        removed_pv_names: Optional set that receives the name of every
            candidate whose delete call succeeded.

    Returns:
        ``(deleted, skipped, failures, remaining_budget)``. The remaining
        budget is ``None`` when no budget was supplied.
    """
    deleted_count = skipped_count = failure_count = 0
    remaining = deletion_budget
    unlimited = remaining is None

    for item in candidates:
        target = item.name
        # Nameless candidates and candidates beyond the budget are skipped
        # without spending any budget.
        if not target or (not unlimited and remaining <= 0):
            skipped_count += 1
            continue
        # The budget is charged per attempt, so a failed delete still
        # consumes one unit.
        if not unlimited:
            remaining -= 1
        try:
            delete_json(item.path)
            deleted_count += 1
            if removed_pv_names is not None:
                removed_pv_names.add(target)
        except Exception as exc:
            # Best effort: record the failure and carry on with the rest.
            failure_count += 1
            logger.info(
                failure_log,
                extra={"event": "jenkins_workspace_cleanup", failure_field: target, "detail": str(exc)},
            )

    return deleted_count, skipped_count, failure_count, remaining
def _record_guard_cap(
    *,
    max_deletions: int,
    stale_pvcs: list[_CleanupCandidate],
    stale_pvs: list[_CleanupCandidate],
    stale_volumes: list[_CleanupCandidate],
    dry_run: bool,
) -> None:
    """Warn when the candidate total exceeds the per-run deletion cap.

    Nothing is logged when the combined number of PVC, PV, and volume
    candidates is within ``max_deletions``.
    """
    pvc_count = len(stale_pvcs)
    pv_count = len(stale_pvs)
    volume_count = len(stale_volumes)
    candidate_total = pvc_count + pv_count + volume_count

    within_cap = candidate_total <= max_deletions
    if within_cap:
        return

    guard_details = {
        "event": "jenkins_workspace_cleanup",
        "status": "guard_capped",
        "namespace": settings.jenkins_workspace_namespace,
        "dry_run": dry_run,
        "planned_total": candidate_total,
        "max_deletions": max_deletions,
        "planned_pvs": pv_count,
        "planned_pvcs": pvc_count,
        "planned_volumes": volume_count,
    }
    logger.warning(
        "jenkins workspace cleanup capped by max deletions guard",
        extra=guard_details,
    )
def _dry_run_summary(
    *,
    namespace: str,
    max_deletions: int,
    stale_pvcs: list[_CleanupCandidate],
    stale_pvs: list[_CleanupCandidate],
    all_pv_names: set[str],
) -> JenkinsWorkspaceCleanupSummary:
    """Plan a cleanup pass without deleting anything.

    The PV names a real run would remove are simulated first and passed to
    the Longhorn candidate lookup. The guard-cap warning and a dry-run log
    line are then emitted.

    Returns:
        A summary with the planned counts filled in, every deleted, skipped,
        and failure count at zero, and ``dry_run=True``.
    """
    would_remove = _planned_removed_pv_names_dry_run(stale_pvcs, stale_pvs, max_deletions)
    orphan_volumes = _workspace_longhorn_candidates(settings, get_json, all_pv_names, would_remove)

    _record_guard_cap(
        max_deletions=max_deletions,
        stale_pvcs=stale_pvcs,
        stale_pvs=stale_pvs,
        stale_volumes=orphan_volumes,
        dry_run=True,
    )

    plan = JenkinsWorkspaceCleanupSummary(
        pvs_planned=len(stale_pvs),
        pvcs_planned=len(stale_pvcs),
        volumes_planned=len(orphan_volumes),
        pvs_deleted=0,
        pvcs_deleted=0,
        volumes_deleted=0,
        skipped=0,
        failures=0,
        dry_run=True,
    )

    log_fields = {
        "event": "jenkins_workspace_cleanup",
        "status": "dry_run",
        "namespace": namespace,
        "dry_run": True,
        "planned_pvs": plan.pvs_planned,
        "planned_pvcs": plan.pvcs_planned,
        "planned_volumes": plan.volumes_planned,
        "max_deletions": max_deletions,
    }
    logger.info("jenkins workspace cleanup dry-run enabled", extra=log_fields)
    return plan
def _delete_run_summary(
    *,
    namespace: str,
    max_deletions: int,
    stale_pvcs: list[_CleanupCandidate],
    stale_pvs: list[_CleanupCandidate],
    all_pv_names: set[str],
) -> JenkinsWorkspaceCleanupSummary:
    """Run a real cleanup pass: delete PVCs, then PVs, then Longhorn volumes.

    A single deletion budget starting at ``max_deletions`` is threaded
    through all three phases. Longhorn candidates are looked up only after
    the PV phase, using the names of the PVs that were actually deleted.
    ``namespace`` is accepted for signature parity with the dry-run helper
    and is not read here.

    Returns:
        A summary with planned and deleted counts per kind, combined skip
        and failure counts, and ``dry_run=False``.
    """
    deleted_pv_names: set[str] = set()

    # Phase 1: PVCs get first call on the budget.
    pvc_done, pvc_skips, pvc_errors, budget_left = _delete_candidates(
        stale_pvcs,
        deletion_budget=max_deletions,
        failure_log="jenkins workspace pvc delete failed",
        failure_field="claim",
    )

    # Phase 2: PVs; successful deletions are recorded by name.
    pv_done, pv_skips, pv_errors, budget_left = _delete_candidates(
        stale_pvs,
        deletion_budget=budget_left,
        failure_log="jenkins workspace pv delete failed",
        failure_field="pv",
        removed_pv_names=deleted_pv_names,
    )

    # Phase 3: Longhorn volumes, resolved against the PVs just removed.
    orphan_volumes = _workspace_longhorn_candidates(settings, get_json, all_pv_names, deleted_pv_names)
    _record_guard_cap(
        max_deletions=max_deletions,
        stale_pvcs=stale_pvcs,
        stale_pvs=stale_pvs,
        stale_volumes=orphan_volumes,
        dry_run=False,
    )
    volume_done, volume_skips, volume_errors, _unused_budget = _delete_candidates(
        orphan_volumes,
        deletion_budget=budget_left,
        failure_log="jenkins workspace longhorn volume delete failed",
        failure_field="volume",
    )

    return JenkinsWorkspaceCleanupSummary(
        pvs_planned=len(stale_pvs),
        pvcs_planned=len(stale_pvcs),
        volumes_planned=len(orphan_volumes),
        pvs_deleted=pv_done,
        pvcs_deleted=pvc_done,
        volumes_deleted=volume_done,
        skipped=sum((pvc_skips, pv_skips, volume_skips)),
        failures=sum((pvc_errors, pv_errors, volume_errors)),
        dry_run=False,
    )
def _record_metrics(summary: JenkinsWorkspaceCleanupSummary) -> None:
    """Publish one cleanup pass to the module's Prometheus metrics.

    Increments the run counter, stamps the last-run gauge together with
    either the last-success or last-failure gauge, overwrites the per-kind
    "last" gauges, and adds any non-zero planned, deleted, skipped, and
    failed counts to the cumulative object counter.

    Args:
        summary: Result of the cleanup pass. ``dry_run`` selects the
            ``mode`` label; a non-zero ``failures`` marks the run as
            ``error``.
    """
    mode = "dry_run" if summary.dry_run else "delete"
    status = "ok" if summary.failures == 0 else "error"
    # Read the clock once so the last-run gauge carries exactly the same
    # value as the success/failure gauge set in the same call.
    now_ts = datetime.now(timezone.utc).timestamp()

    JENKINS_WORKSPACE_CLEANUP_RUNS_TOTAL.labels(status=status, mode=mode).inc()
    if summary.failures:
        JENKINS_WORKSPACE_CLEANUP_LAST_FAILURE_TS.set(now_ts)
    else:
        JENKINS_WORKSPACE_CLEANUP_LAST_SUCCESS_TS.set(now_ts)
    JENKINS_WORKSPACE_CLEANUP_LAST_RUN_TS.set(now_ts)

    per_kind = (
        ("pvc", summary.pvcs_planned, summary.pvcs_deleted),
        ("pv", summary.pvs_planned, summary.pvs_deleted),
        ("longhorn_volume", summary.volumes_planned, summary.volumes_deleted),
    )

    # "Last run" gauges are always overwritten, zeros included.
    for kind, _planned, deleted in per_kind:
        JENKINS_WORKSPACE_CLEANUP_LAST_DELETED.labels(kind=kind).set(deleted)
    for kind, planned, _deleted in per_kind:
        JENKINS_WORKSPACE_CLEANUP_LAST_PLANNED.labels(kind=kind).set(planned)
    JENKINS_WORKSPACE_CLEANUP_LAST_SKIPPED.set(summary.skipped)
    JENKINS_WORKSPACE_CLEANUP_LAST_FAILURES.set(summary.failures)

    # Cumulative counters are only touched for non-zero amounts, so label
    # combinations that never occurred are not created.
    for kind, planned, deleted in per_kind:
        if planned:
            JENKINS_WORKSPACE_CLEANUP_OBJECTS_TOTAL.labels(kind=kind, action="planned", mode=mode).inc(planned)
        if deleted:
            JENKINS_WORKSPACE_CLEANUP_OBJECTS_TOTAL.labels(kind=kind, action="deleted", mode=mode).inc(deleted)
    if summary.skipped:
        JENKINS_WORKSPACE_CLEANUP_OBJECTS_TOTAL.labels(
            kind="cleanup",
            action="skipped",
            mode=mode,
        ).inc(summary.skipped)
    if summary.failures:
        JENKINS_WORKSPACE_CLEANUP_OBJECTS_TOTAL.labels(
            kind="cleanup",
            action="failed",
            mode=mode,
        ).inc(summary.failures)
def cleanup_jenkins_workspace_storage() -> JenkinsWorkspaceCleanupSummary:
    """Run one Jenkins workspace-storage cleanup pass and report the result.

    Validates the cleanup settings, collects the active claims and the stale
    PV/PVC candidates, then hands off to the dry-run or delete helper
    according to the configured mode. Metrics are recorded and a completion
    line is logged before the summary is returned.

    Returns:
        The summary produced by the dry-run or delete helper.

    Raises:
        Exception: whatever validation, candidate discovery, or the chosen
            helper raised. Before re-raising, the error is logged and
            metrics are recorded with the failure count raised by one.
    """
    # Baseline result, used for metrics if the pass aborts before a helper
    # returns a real summary.
    result = JenkinsWorkspaceCleanupSummary(
        pvs_planned=0,
        pvcs_planned=0,
        volumes_planned=0,
        pvs_deleted=0,
        pvcs_deleted=0,
        volumes_deleted=0,
        skipped=0,
        failures=0,
        dry_run=settings.jenkins_workspace_cleanup_dry_run,
    )
    try:
        namespace, _prefix, dry_run, max_deletions = _validate_cleanup_settings()
        active_claims = _active_workspace_claims(settings, get_json)
        stale_pvs, all_pv_names = _workspace_pv_candidates(settings, get_json, active_claims)
        stale_pvcs = _workspace_pvc_candidates(settings, get_json, active_claims)

        # Both helpers take the same keyword arguments, so only the callable
        # differs between modes.
        build_summary = _dry_run_summary if dry_run else _delete_run_summary
        result = build_summary(
            namespace=namespace,
            max_deletions=max_deletions,
            stale_pvcs=stale_pvcs,
            stale_pvs=stale_pvs,
            all_pv_names=all_pv_names,
        )
    except Exception as exc:
        error_fields = {
            "event": "jenkins_workspace_cleanup",
            "status": "error",
            "namespace": settings.jenkins_workspace_namespace,
            "detail": str(exc),
        }
        logger.exception("jenkins workspace cleanup failed", extra=error_fields)
        # Count the aborting exception itself as one additional failure.
        failed = JenkinsWorkspaceCleanupSummary(
            pvs_planned=result.pvs_planned,
            pvcs_planned=result.pvcs_planned,
            volumes_planned=result.volumes_planned,
            pvs_deleted=result.pvs_deleted,
            pvcs_deleted=result.pvcs_deleted,
            volumes_deleted=result.volumes_deleted,
            skipped=result.skipped,
            failures=result.failures + 1,
            dry_run=result.dry_run,
        )
        _record_metrics(failed)
        raise

    _record_metrics(result)
    final_status = "ok" if result.failures == 0 else "error"
    finish_fields = {
        "event": "jenkins_workspace_cleanup",
        "status": final_status,
        "dry_run": result.dry_run,
        "namespace": namespace,
        "planned_pvs": result.pvs_planned,
        "planned_pvcs": result.pvcs_planned,
        "planned_volumes": result.volumes_planned,
        "deleted_pvs": result.pvs_deleted,
        "deleted_pvcs": result.pvcs_deleted,
        "deleted_volumes": result.volumes_deleted,
        "skipped": result.skipped,
        "failures": result.failures,
    }
    logger.info("jenkins workspace cleanup finished", extra=finish_fields)
    return result