diff --git a/scripts/node_recover.sh b/scripts/node_recover.sh
new file mode 100755
index 00000000..44e656f3
--- /dev/null
+++ b/scripts/node_recover.sh
@@ -0,0 +1,163 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat <<USAGE
+Usage: $(basename "$0") <node-name> [options]
+
+Options:
+  --yes            Skip confirmation prompt
+  --skip-drain     Do not cordon/drain; only capture recovery artifacts
+  --delete-node    Delete Node object after drain (for hard-dead node replacement)
+  --out-dir <dir>  Recovery artifact directory (default: ./artifacts/node-recovery)
+  -h, --help       Show this help
+USAGE
+}
+
+if ! command -v kubectl >/dev/null 2>&1; then
+  echo "kubectl is required" >&2
+  exit 1
+fi
+if ! command -v jq >/dev/null 2>&1; then
+  echo "jq is required" >&2
+  exit 1
+fi
+
+if [ "$#" -lt 1 ]; then
+  usage
+  exit 1
+fi
+
+node=""
+assume_yes="false"
+skip_drain="false"
+delete_node="false"
+out_dir="./artifacts/node-recovery"
+
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --yes)
+      assume_yes="true"
+      shift
+      ;;
+    --skip-drain)
+      skip_drain="true"
+      shift
+      ;;
+    --delete-node)
+      delete_node="true"
+      shift
+      ;;
+    --out-dir)
+      out_dir="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    -*)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+    *)
+      if [ -z "${node}" ]; then
+        node="$1"
+      else
+        echo "Unexpected argument: $1" >&2
+        usage
+        exit 1
+      fi
+      shift
+      ;;
+  esac
+done
+
+if [ -z "${node}" ]; then
+  echo "Node name is required" >&2
+  usage
+  exit 1
+fi
+
+if ! kubectl get node "${node}" >/dev/null 2>&1; then
+  echo "Node ${node} not found in cluster API" >&2
+  exit 1
+fi
+
+if [ "${assume_yes}" != "true" ]; then
+  echo "About to prepare recovery workflow for node: ${node}"
+  echo "skip_drain=${skip_drain} delete_node=${delete_node}"
+  read -r -p "Type the node name to continue: " confirm
+  if [ "${confirm}" != "${node}" ]; then
+    echo "Confirmation did not match node name; aborting."
+    exit 1
+  fi
+fi
+
+timestamp="$(date +%Y%m%d-%H%M%S)"
+artifacts_dir="${out_dir}/${node}-${timestamp}"
+mkdir -p "${artifacts_dir}"
+
+echo "Saving node and workload artifacts to ${artifacts_dir}"
+kubectl get node "${node}" -o json > "${artifacts_dir}/node.json"
+kubectl get node "${node}" --show-labels > "${artifacts_dir}/node.txt"
+kubectl get pods -A --field-selector "spec.nodeName=${node}" -o wide > "${artifacts_dir}/pods-on-node.txt"
+
+# Emit replay scripts for the node's labels and taints; well-known system label
+# namespaces (kubernetes.io/, beta.kubernetes.io/, node.kubernetes.io/) are skipped.
+jq -r --arg node "${node}" '
+  .metadata.labels
+  | to_entries[]
+  | select(
+      .key != "kubernetes.io/hostname"
+      and .key != "beta.kubernetes.io/hostname"
+      and .key != "node.kubernetes.io/instance-type"
+      and .key != "beta.kubernetes.io/instance-type"
+      and (.key | startswith("kubernetes.io/") | not)
+      and (.key | startswith("beta.kubernetes.io/") | not)
+      and (.key | startswith("node.kubernetes.io/") | not)
+    )
+  | "kubectl label node " + $node + " " + .key + "=" + .value + " --overwrite"
+' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-labels.sh"
+
+jq -r --arg node "${node}" '
+  (.spec.taints // [])[]
+  | "kubectl taint node " + $node + " "
+    + .key
+    + (if .value then "=" + .value else "" end)
+    + ":"
+    + .effect
+    + " --overwrite"
+' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-taints.sh"
+
+chmod +x "${artifacts_dir}/restore-labels.sh" "${artifacts_dir}/restore-taints.sh"
+
+if [ "${skip_drain}" != "true" ]; then
+  echo "Cordoning ${node}"
+  kubectl cordon "${node}" || true
+
+  echo "Draining ${node}"
+  # Drain escalation: eviction-based drain first, then --force for pods without
+  # a controller, then --disable-eviction (bypasses PodDisruptionBudgets) last.
+  if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m; then
+    echo "Standard drain failed; retrying with --force"
+    if ! 
kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force; then + echo "Force drain failed; retrying with --disable-eviction" + kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force --disable-eviction + fi + fi +fi + +if [ "${delete_node}" = "true" ]; then + echo "Deleting node object ${node}" + kubectl delete node "${node}" || true +fi + +cat < --yes --delete-node` from `titan-iac`. +2. Reimage/reprovision the replacement host. +3. Rejoin the replacement node to k3s. +4. Reapply labels and taints from generated artifacts: + - `artifacts/node-recovery/-/restore-labels.sh` + - `artifacts/node-recovery/-/restore-taints.sh` +5. Verify workloads, then uncordon the replacement node. + +### Notes +- `node_recover.sh` snapshots node labels/taints and current pod placement before drain. +- Use `--skip-drain` for a dead/unreachable node where only artifact capture is possible. +- Use `--delete-node` after drain (or for hard-dead nodes) so replacement join is clean. + ## Node classes (current map) - rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent) - rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint) diff --git a/services/logging/data-prepper-helmrelease.yaml b/services/logging/data-prepper-helmrelease.yaml index 1c0bc45d..acf9a352 100644 --- a/services/logging/data-prepper-helmrelease.yaml +++ b/services/logging/data-prepper-helmrelease.yaml @@ -40,15 +40,25 @@ spec: memory: "512Mi" limits: memory: "1Gi" - nodeSelector: - node-role.kubernetes.io/worker: "true" - hardware: rpi5 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: + - matchExpressions: + - key: jetson + operator: In + values: + - "true" - matchExpressions: - key: hardware operator: In values: - rpi5 + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: jetson + operator: In + values: + - "true" diff --git a/services/logging/fluent-bit-helmrelease.yaml b/services/logging/fluent-bit-helmrelease.yaml index e16890c9..a330340e 100644 --- a/services/logging/fluent-bit-helmrelease.yaml +++ b/services/logging/fluent-bit-helmrelease.yaml @@ -51,7 +51,7 @@ spec: service: | [SERVICE] Flush 1 - Log_Level info + Log_Level warn Daemon Off Parsers_File parsers.conf Parsers_File custom_parsers.conf @@ -74,7 +74,7 @@ spec: Refresh_Interval 10 Rotate_Wait 30 Inotify_Watcher false - Read_from_Head On + Read_from_Head Off DB /var/lib/fluent-bit/kube.db storage.type filesystem @@ -82,7 +82,7 @@ spec: Name systemd Tag journald.* Path /var/log/journal - Read_From_Tail Off + Read_From_Tail On DB /var/lib/fluent-bit/systemd.db storage.type filesystem filters: | @@ -107,7 +107,7 @@ spec: Logstash_Prefix kube Replace_Dots On Suppress_Type_Name On - Retry_Limit False + Retry_Limit 10 [OUTPUT] Name es @@ -119,4 +119,4 @@ spec: Logstash_Prefix journald Replace_Dots On Suppress_Type_Name On - Retry_Limit False + Retry_Limit 10 diff --git a/services/logging/node-log-rotation-daemonset.yaml b/services/logging/node-log-rotation-daemonset.yaml index f6a672c3..b7753c36 100644 --- a/services/logging/node-log-rotation-daemonset.yaml +++ b/services/logging/node-log-rotation-daemonset.yaml @@ -24,7 +24,17 @@ spec: operator: Exists effect: NoSchedule nodeSelector: - hardware: rpi5 + node-role.kubernetes.io/worker: "true" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + 
nodeSelectorTerms: + - matchExpressions: + - key: hardware + operator: In + values: + - rpi4 + - rpi5 containers: - name: node-log-rotation image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 diff --git a/services/logging/opensearch-dashboards-helmrelease.yaml b/services/logging/opensearch-dashboards-helmrelease.yaml index 85f859e7..bf6c41fc 100644 --- a/services/logging/opensearch-dashboards-helmrelease.yaml +++ b/services/logging/opensearch-dashboards-helmrelease.yaml @@ -37,15 +37,25 @@ spec: limits: cpu: "200m" memory: "512Mi" - nodeSelector: - node-role.kubernetes.io/worker: "true" - hardware: rpi5 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: + - matchExpressions: + - key: jetson + operator: In + values: + - "true" - matchExpressions: - key: hardware operator: In values: - rpi5 + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: jetson + operator: In + values: + - "true" diff --git a/services/logging/opensearch-helmrelease.yaml b/services/logging/opensearch-helmrelease.yaml index 3d7dd6b7..c43a2b9e 100644 --- a/services/logging/opensearch-helmrelease.yaml +++ b/services/logging/opensearch-helmrelease.yaml @@ -40,17 +40,27 @@ spec: discovery.type: single-node plugins.security.disabled: true node.store.allow_mmap: false - nodeSelector: - node-role.kubernetes.io/worker: "true" - hardware: rpi5 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: + - matchExpressions: + - key: jetson + operator: In + values: + - "true" - matchExpressions: - key: hardware operator: In values: - rpi5 + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: jetson + operator: In + values: + - "true" sysctlInit: enabled: true diff --git a/services/logging/otel-collector-helmrelease.yaml b/services/logging/otel-collector-helmrelease.yaml index c24682f5..b1bcc25a 100644 --- a/services/logging/otel-collector-helmrelease.yaml +++ b/services/logging/otel-collector-helmrelease.yaml @@ -76,15 +76,25 @@ spec: memory: "256Mi" limits: memory: "512Mi" - nodeSelector: - node-role.kubernetes.io/worker: "true" - hardware: rpi5 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: + - matchExpressions: + - key: jetson + operator: In + values: + - "true" - matchExpressions: - key: hardware operator: In values: - rpi5 + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: jetson + operator: In + values: + - "true" diff --git a/services/logging/scripts/node_log_rotation.sh b/services/logging/scripts/node_log_rotation.sh index 534806ff..c12847e0 100644 --- a/services/logging/scripts/node_log_rotation.sh +++ b/services/logging/scripts/node_log_rotation.sh @@ -12,39 +12,77 @@ k3s_agent_dropin="/host/etc/systemd/system/k3s-agent.service.d/99-logging.conf" k3s_image_gc_dropin="/host/etc/systemd/system/k3s.service.d/98-image-gc.conf" k3s_agent_image_gc_dropin="/host/etc/systemd/system/k3s-agent.service.d/98-image-gc.conf" -if [ ! 
-f "${journald_dropin}" ]; then - mkdir -p "$(dirname "${journald_dropin}")" - printf "[Journal]\nStorage=volatile\nRuntimeMaxUse=200M\nRuntimeKeepFree=512M\nMaxFileSec=1h\n" > "${journald_dropin}" - changed=1 - journald_changed=1 +ensure_dropin() { + local path="$1" + local owner="$2" + local new_content="$3" + local current="" + if [ -f "${path}" ]; then + current="$(cat "${path}" || true)" + fi + if [ "${current}" != "${new_content}" ]; then + mkdir -p "$(dirname "${path}")" + printf "%s\n" "${new_content}" > "${path}" + changed=1 + case "${owner}" in + journald) + journald_changed=1 + ;; + k3s) + k3s_changed=1 + ;; + k3s-agent) + k3s_agent_changed=1 + ;; + esac + fi +} + +ensure_dropin \ + "${journald_dropin}" \ + "journald" \ + "[Journal] +Storage=volatile +RuntimeMaxUse=200M +RuntimeKeepFree=512M +MaxFileSec=1h" + +if [ -f "/host/etc/systemd/system/k3s.service" ]; then + ensure_dropin \ + "${k3s_dropin}" \ + "k3s" \ + "[Service] +Environment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\" +Environment=\"K3S_KUBELET_ARG=container-log-max-files=2\"" fi -if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_dropin}" ]; then - mkdir -p "$(dirname "${k3s_dropin}")" - printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_dropin}" - changed=1 - k3s_changed=1 +if [ -f "/host/etc/systemd/system/k3s.service" ]; then + ensure_dropin \ + "${k3s_image_gc_dropin}" \ + "k3s" \ + "[Service] +Environment=\"K3S_KUBELET_ARG=image-gc-high-threshold=65\" +Environment=\"K3S_KUBELET_ARG=image-gc-low-threshold=50\" +Environment=\"K3S_KUBELET_ARG=image-gc-minimum-available=8Gi\"" fi -if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_image_gc_dropin}" ]; then - mkdir -p "$(dirname "${k3s_image_gc_dropin}")" - printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_image_gc_dropin}" - changed=1 - k3s_changed=1 +if [ -f "/host/etc/systemd/system/k3s-agent.service" ]; then + ensure_dropin \ + "${k3s_agent_dropin}" \ + "k3s-agent" \ + "[Service] +Environment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\" +Environment=\"K3S_KUBELET_ARG=container-log-max-files=2\"" fi -if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_dropin}" ]; then - mkdir -p "$(dirname "${k3s_agent_dropin}")" - printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_agent_dropin}" - changed=1 - k3s_agent_changed=1 -fi - -if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! 
-f "${k3s_agent_image_gc_dropin}" ]; then - mkdir -p "$(dirname "${k3s_agent_image_gc_dropin}")" - printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_agent_image_gc_dropin}" - changed=1 - k3s_agent_changed=1 +if [ -f "/host/etc/systemd/system/k3s-agent.service" ]; then + ensure_dropin \ + "${k3s_agent_image_gc_dropin}" \ + "k3s-agent" \ + "[Service] +Environment=\"K3S_KUBELET_ARG=image-gc-high-threshold=65\" +Environment=\"K3S_KUBELET_ARG=image-gc-low-threshold=50\" +Environment=\"K3S_KUBELET_ARG=image-gc-minimum-available=8Gi\"" fi if [ "${changed}" -eq 1 ]; then diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index fce1ded5..e5eacf6f 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -18,6 +18,7 @@ spec: prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/metrics" + maintenance.bstein.dev/restart-rev: "20260207-2" vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "maintenance" vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" @@ -105,7 +106,7 @@ spec: node-role.kubernetes.io/worker: "true" containers: - name: ariadne - image: registry.bstein.dev/bstein/ariadne:0.1.0-0 + image: registry.bstein.dev/bstein/ariadne:latest imagePullPolicy: Always command: ["/bin/sh", "-c"] args: @@ -285,7 +286,7 @@ spec: - name: ARIADNE_SCHEDULE_MAILU_SYNC value: "30 4 * * *" - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC - value: "0 5 * * *" + value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON value: "*/5 * * * *" - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE @@ -293,23 +294,23 @@ spec: - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC value: "0 * * * *" - name: ARIADNE_SCHEDULE_WGER_USER_SYNC - value: "0 5 * * *" + value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_WGER_ADMIN value: "15 3 * * *" - name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC - value: "0 6 * * *" + value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_FIREFLY_CRON value: "0 3 * * *" - name: ARIADNE_SCHEDULE_POD_CLEANER - value: "0 * * * *" + value: "*/30 * * * *" - name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE value: "23 3 * * *" - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER - value: "30 4 * * 0" + value: "0 */4 * * *" - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH - value: "0 * * * *" + value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_VAULT_OIDC - value: "0 * * * *" + value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME value: "*/5 * * * *" - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE @@ -319,9 +320,9 @@ spec: - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM value: "*/10 * * * *" - name: ARIADNE_SCHEDULE_CLUSTER_STATE - value: "*/15 * * * *" + value: "*/10 * * * *" - name: ARIADNE_CLUSTER_STATE_KEEP - value: "168" + value: "720" - name: WELCOME_EMAIL_ENABLED value: "true" - name: K8S_API_TIMEOUT_SEC @@ -330,6 +331,8 @@ spec: value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428 - name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC value: "5" + - name: ARIADNE_ALERTMANAGER_URL + value: http://alertmanager.monitoring.svc.cluster.local - name: OPENSEARCH_URL value: http://opensearch-master.logging.svc.cluster.local:9200 - name: OPENSEARCH_LIMIT_BYTES diff --git a/services/maintenance/node-image-sweeper-daemonset.yaml b/services/maintenance/node-image-sweeper-daemonset.yaml index 488c0605..5b03cdc4 100644 
--- a/services/maintenance/node-image-sweeper-daemonset.yaml +++ b/services/maintenance/node-image-sweeper-daemonset.yaml @@ -33,17 +33,15 @@ spec: command: ["/bin/sh", "/scripts/node_image_sweeper.sh"] env: - name: SWEEP_INTERVAL_SEC - value: "21600" + value: "7200" - name: HIGH_USAGE_PERCENT value: "70" - name: EMERGENCY_USAGE_PERCENT value: "80" - - name: BASE_THRESHOLD_DAYS - value: "14" - - name: HIGH_USAGE_THRESHOLD_DAYS - value: "3" - name: LOG_RETENTION_DAYS value: "7" + - name: ORPHAN_POD_RETENTION_DAYS + value: "3" - name: JOURNAL_MAX_SIZE value: "200M" securityContext: diff --git a/services/maintenance/scripts/node_image_sweeper.sh b/services/maintenance/scripts/node_image_sweeper.sh index c2fb6da1..98eedd8f 100644 --- a/services/maintenance/scripts/node_image_sweeper.sh +++ b/services/maintenance/scripts/node_image_sweeper.sh @@ -3,96 +3,71 @@ set -eu ONE_SHOT=${ONE_SHOT:-false} SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600} -BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14} -HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3} HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70} EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85} LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7} +ORPHAN_POD_RETENTION_DAYS=${ORPHAN_POD_RETENTION_DAYS:-3} JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M} -SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause" + +cleanup_orphaned_hdd_pod_logs() { + if [ ! -d /host/var/log.hdd/pods ]; then + return 0 + fi + + ORPHAN_POD_RETENTION_DAYS="${ORPHAN_POD_RETENTION_DAYS}" python3 - <<'PY' +import os +import shutil +import time + +hdd_pods = "/host/var/log.hdd/pods" +active_pods = "/host/var/log/pods" +retention_days = int(os.environ.get("ORPHAN_POD_RETENTION_DAYS", "3")) +cutoff = time.time() - (retention_days * 86400) + +try: + active_names = set(os.listdir(active_pods)) +except Exception: + active_names = set() + +try: + hdd_names = os.listdir(hdd_pods) +except Exception: + hdd_names = [] + +for name in hdd_names: + path = os.path.join(hdd_pods, name) + if not os.path.isdir(path): + continue + if name in active_names: + continue + try: + mtime = os.path.getmtime(path) + except Exception: + continue + if mtime > cutoff: + continue + print(path) + shutil.rmtree(path, ignore_errors=True) +PY +} sweep_once() { usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage="" - threshold_days="${BASE_THRESHOLD_DAYS}" + + # crictl image metadata frequently omits createdAt on this cluster; prune by + # runtime reachability whenever rootfs crosses pressure thresholds. 
if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then - threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}" + chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true" fi - cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY' -import os -import time + cleanup_orphaned_hdd_pod_logs -days = int(os.environ.get("THRESHOLD_DAYS", "14")) -print(int(time.time()) - days * 86400) -PY -) + if [ -d /host/var/log.hdd/pods ]; then + find /host/var/log.hdd/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true + fi - RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ') - IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}') - - prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY' -import json -import os -import sys -import time - -try: - data = json.load(sys.stdin) -except Exception: - print("", end="") - sys.exit(0) - -cutoff = int(os.environ.get("CUTOFF", "0")) -running = set(os.environ.get("RUNNING", "").split()) -skip = os.environ.get("SKIP", "").split() -now = int(time.time()) -prune = [] - - -def is_skip(tags): - if not tags: - return False - for t in tags: - for prefix in skip: - if prefix and t.startswith(prefix): - return True - return False - - -for img in data.get("images", []): - image_id = img.get("id", "") - if not image_id: - continue - if image_id in running: - continue - tags = img.get("repoTags") or [] - if is_skip(tags): - continue - created = img.get("createdAt") or 0 - try: - created = int(str(created)) // 1000000000 - except Exception: - created = 0 - if created and created > now: - created = now - if cutoff and created and created < cutoff: - prune.append(image_id) - -seen = set() -for p in prune: - if p in seen: - continue - seen.add(p) - print(p) -PY -) - - if [ -n "${prune_list}" ]; then - printf "%s" "${prune_list}" | while read -r image_id; do - if [ -n "${image_id}" ]; then - chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true - fi - done + if [ -d /host/var/log.hdd/containers ]; then + find /host/var/log.hdd/containers -xtype l -print -delete 2>/dev/null || true fi find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true @@ -100,9 +75,11 @@ PY if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then # Emergency pass for rootfs pressure on SD-backed nodes. 
+ chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true" chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true" find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true + find /host/var/log.hdd -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi" fi } diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml index 0bc792f4..934a0721 100644 --- a/services/monitoring/grafana-alerting-config.yaml +++ b/services/monitoring/grafana-alerting-config.yaml @@ -303,8 +303,56 @@ data: summary: "node-image-sweeper not fully ready" labels: severity: warning + - uid: logging-node-log-rotation-not-ready + title: "Node log rotation guardrails not ready" + condition: C + for: "10m" + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: atlas-vm + model: + intervalMs: 60000 + maxDataPoints: 43200 + expr: kube_daemonset_status_number_ready{namespace="logging",daemonset="node-log-rotation"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="logging",daemonset="node-log-rotation"} + legendFormat: '{{daemonset}}' + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [1] + type: lt + operator: + type: and + reducer: + type: last + type: query + noDataState: NoData + execErrState: Error + annotations: + summary: "node-log-rotation is not fully ready" + labels: + severity: warning - uid: maint-ariadne-image-sweeper-stale - title: "Ariadne image sweeper stale (schedule >8d)" + title: "Ariadne image sweeper stale (schedule >24h)" condition: C for: "5m" data: @@ -338,7 +386,7 @@ data: type: threshold conditions: - evaluator: - params: [691200] + params: [86400] type: gt operator: type: and @@ -348,7 +396,7 @@ data: noDataState: OK execErrState: Error annotations: - summary: "Ariadne image sweeper stale >8d since last success" + summary: "Ariadne image sweeper stale >24h since last success" labels: severity: warning - uid: maint-cron-stale