maintenance: harden sd-write controls and recovery workflow
commit be92017f4d
parent 678d0efa2c
scripts/node_recover.sh | 163 (new executable file)
@@ -0,0 +1,163 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat <<USAGE
+Usage: scripts/node_recover.sh <node-name> [options]
+
+Options:
+  --yes           Skip confirmation prompt
+  --skip-drain    Do not cordon/drain; only capture recovery artifacts
+  --delete-node   Delete Node object after drain (for hard-dead node replacement)
+  --out-dir <dir> Recovery artifact directory (default: ./artifacts/node-recovery)
+  -h, --help      Show this help
+USAGE
+}
+
+if ! command -v kubectl >/dev/null 2>&1; then
+  echo "kubectl is required" >&2
+  exit 1
+fi
+if ! command -v jq >/dev/null 2>&1; then
+  echo "jq is required" >&2
+  exit 1
+fi
+
+if [ "$#" -lt 1 ]; then
+  usage
+  exit 1
+fi
+
+node=""
+assume_yes="false"
+skip_drain="false"
+delete_node="false"
+out_dir="./artifacts/node-recovery"
+
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --yes)
+      assume_yes="true"
+      shift
+      ;;
+    --skip-drain)
+      skip_drain="true"
+      shift
+      ;;
+    --delete-node)
+      delete_node="true"
+      shift
+      ;;
+    --out-dir)
+      out_dir="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    -*)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+    *)
+      if [ -z "${node}" ]; then
+        node="$1"
+      else
+        echo "Unexpected argument: $1" >&2
+        usage
+        exit 1
+      fi
+      shift
+      ;;
+  esac
+done
+
+if [ -z "${node}" ]; then
+  echo "Node name is required" >&2
+  usage
+  exit 1
+fi
+
+if ! kubectl get node "${node}" >/dev/null 2>&1; then
+  echo "Node ${node} not found in cluster API" >&2
+  exit 1
+fi
+
+if [ "${assume_yes}" != "true" ]; then
+  echo "About to prepare recovery workflow for node: ${node}"
+  echo "skip_drain=${skip_drain} delete_node=${delete_node}"
+  read -r -p "Type the node name to continue: " confirm
+  if [ "${confirm}" != "${node}" ]; then
+    echo "Confirmation did not match node name; aborting."
+    exit 1
+  fi
+fi
+
+timestamp="$(date +%Y%m%d-%H%M%S)"
+artifacts_dir="${out_dir}/${node}-${timestamp}"
+mkdir -p "${artifacts_dir}"
+
+echo "Saving node and workload artifacts to ${artifacts_dir}"
+kubectl get node "${node}" -o json > "${artifacts_dir}/node.json"
+kubectl get node "${node}" --show-labels > "${artifacts_dir}/node.txt"
+kubectl get pods -A --field-selector "spec.nodeName=${node}" -o wide > "${artifacts_dir}/pods-on-node.txt"
+
+jq -r '
+  .metadata.labels
+  | to_entries[]
+  | select(
+      .key != "kubernetes.io/hostname"
+      and .key != "beta.kubernetes.io/hostname"
+      and .key != "node.kubernetes.io/instance-type"
+      and .key != "beta.kubernetes.io/instance-type"
+      and (.key | startswith("kubernetes.io/") | not)
+      and (.key | startswith("beta.kubernetes.io/") | not)
+      and (.key | startswith("node.kubernetes.io/") | not)
+    )
+  | "kubectl label node <replacement-node> " + .key + "=" + .value + " --overwrite"
+' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-labels.sh"
+
+jq -r '
+  (.spec.taints // [])[]
+  | "kubectl taint node <replacement-node> "
+    + .key
+    + (if .value then "=" + .value else "" end)
+    + ":"
+    + .effect
+    + " --overwrite"
+' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-taints.sh"
+
+chmod +x "${artifacts_dir}/restore-labels.sh" "${artifacts_dir}/restore-taints.sh"
+
+if [ "${skip_drain}" != "true" ]; then
+  echo "Cordoning ${node}"
+  kubectl cordon "${node}" || true
+
+  echo "Draining ${node}"
+  if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m; then
+    echo "Standard drain failed; retrying with --force"
+    if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force; then
+      echo "Force drain failed; retrying with --disable-eviction"
+      kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force --disable-eviction
+    fi
+  fi
+fi
+
+if [ "${delete_node}" = "true" ]; then
+  echo "Deleting node object ${node}"
+  kubectl delete node "${node}" || true
+fi
+
+cat <<NEXT
+Recovery prep complete for ${node}.
+Artifacts: ${artifacts_dir}
+
+Next steps:
+  1) Reimage/reprovision replacement host.
+  2) Rejoin k3s and wait for node Ready.
+  3) Reapply labels: ${artifacts_dir}/restore-labels.sh
+  4) Reapply taints: ${artifacts_dir}/restore-taints.sh
+  5) Validate pods and uncordon replacement when ready.
+NEXT
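For reference, typical invocations look like the sketch below (the node name is an example taken from the class map later in this commit; the flags are the script's own):

```sh
# Hard-dead SD card: capture artifacts, drain, and remove the Node object.
scripts/node_recover.sh titan-07 --yes --delete-node

# Unreachable node where only artifact capture is possible: skip the drain.
scripts/node_recover.sh titan-07 --skip-drain --out-dir ./artifacts/node-recovery
```

Note that the escalation path ends at `--disable-eviction`, which deletes pods directly instead of going through the eviction API, so PodDisruptionBudgets no longer block the drain.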
@@ -1,5 +1,19 @@
 # Metis (node recovery)
 
+## Fast path (SD/media failure)
+1. Run `scripts/node_recover.sh <node> --yes --delete-node` from `titan-iac`.
+2. Reimage/reprovision the replacement host.
+3. Rejoin the replacement node to k3s.
+4. Reapply labels and taints from generated artifacts:
+   - `artifacts/node-recovery/<node>-<timestamp>/restore-labels.sh`
+   - `artifacts/node-recovery/<node>-<timestamp>/restore-taints.sh`
+5. Verify workloads, then uncordon the replacement node.
+
+### Notes
+- `node_recover.sh` snapshots node labels/taints and current pod placement before drain.
+- Use `--skip-drain` for a dead/unreachable node where only artifact capture is possible.
+- Use `--delete-node` after drain (or for hard-dead nodes) so replacement join is clean.
+
 ## Node classes (current map)
 - rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
 - rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
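The generated restore scripts deliberately target a `<replacement-node>` placeholder, so step 4 needs a substitution before replay. A minimal sketch (the timestamped directory name is hypothetical):

```sh
dir=artifacts/node-recovery/titan-07-20260207-103000   # example artifact dir
sed -i 's/<replacement-node>/titan-07/g' "${dir}/restore-labels.sh" "${dir}/restore-taints.sh"
bash "${dir}/restore-labels.sh"
bash "${dir}/restore-taints.sh"
kubectl uncordon titan-07
```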
@@ -40,15 +40,25 @@ spec:
               memory: "512Mi"
             limits:
               memory: "1Gi"
-      nodeSelector:
-        node-role.kubernetes.io/worker: "true"
-        hardware: rpi5
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
             nodeSelectorTerms:
+              - matchExpressions:
+                  - key: jetson
+                    operator: In
+                    values:
+                      - "true"
               - matchExpressions:
                   - key: hardware
                     operator: In
                     values:
                       - rpi5
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              preference:
+                matchExpressions:
+                  - key: jetson
+                    operator: In
+                    values:
+                      - "true"
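This hunk (and the matching ones further down) swaps a hard rpi5 pin for scheduling terms that admit jetson or rpi5 nodes while preferring jetson. To see which nodes now satisfy the required terms, one check under these assumptions (label keys taken from this diff) is:

```sh
# Extra columns show who matches: jetson=true OR hardware=rpi5, preferring jetson.
kubectl get nodes -L hardware,jetson,node-role.kubernetes.io/worker
```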
@@ -51,7 +51,7 @@ spec:
   service: |
     [SERVICE]
         Flush 1
-        Log_Level info
+        Log_Level warn
         Daemon Off
         Parsers_File parsers.conf
         Parsers_File custom_parsers.conf
@@ -74,7 +74,7 @@
         Refresh_Interval 10
         Rotate_Wait 30
         Inotify_Watcher false
-        Read_from_Head On
+        Read_from_Head Off
         DB /var/lib/fluent-bit/kube.db
         storage.type filesystem
 
@@ -82,7 +82,7 @@
        Name systemd
        Tag journald.*
        Path /var/log/journal
-       Read_From_Tail Off
+       Read_From_Tail On
        DB /var/lib/fluent-bit/systemd.db
        storage.type filesystem
  filters: |
@@ -107,7 +107,7 @@
        Logstash_Prefix kube
        Replace_Dots On
        Suppress_Type_Name On
-       Retry_Limit False
+       Retry_Limit 10
 
    [OUTPUT]
        Name es
@@ -119,4 +119,4 @@
        Logstash_Prefix journald
        Replace_Dots On
        Suppress_Type_Name On
-       Retry_Limit False
+       Retry_Limit 10
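Taken together, these fluent-bit changes cap write amplification: with `Read_from_Head Off` plus a filesystem-backed `DB`, restarts resume from a checkpoint instead of re-ingesting whole files, and a bounded `Retry_Limit` stops an unreachable Elasticsearch output from buffering forever. A node-side spot check, assuming the DB paths from this config, is:

```sh
# Tail/systemd offsets should persist here across fluent-bit pod restarts.
ls -lh /var/lib/fluent-bit/kube.db /var/lib/fluent-bit/systemd.db
```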
@@ -24,7 +24,17 @@ spec:
           operator: Exists
           effect: NoSchedule
       nodeSelector:
-        hardware: rpi5
+        node-role.kubernetes.io/worker: "true"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: hardware
+                    operator: In
+                    values:
+                      - rpi4
+                      - rpi5
       containers:
         - name: node-log-rotation
           image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
@@ -37,15 +37,25 @@ spec:
            limits:
              cpu: "200m"
              memory: "512Mi"
-      nodeSelector:
-        node-role.kubernetes.io/worker: "true"
-        hardware: rpi5
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
             nodeSelectorTerms:
+              - matchExpressions:
+                  - key: jetson
+                    operator: In
+                    values:
+                      - "true"
               - matchExpressions:
                   - key: hardware
                     operator: In
                     values:
                       - rpi5
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              preference:
+                matchExpressions:
+                  - key: jetson
+                    operator: In
+                    values:
+                      - "true"
@@ -40,17 +40,27 @@ spec:
     discovery.type: single-node
     plugins.security.disabled: true
     node.store.allow_mmap: false
-  nodeSelector:
-    node-role.kubernetes.io/worker: "true"
-    hardware: rpi5
   affinity:
     nodeAffinity:
       requiredDuringSchedulingIgnoredDuringExecution:
         nodeSelectorTerms:
+          - matchExpressions:
+              - key: jetson
+                operator: In
+                values:
+                  - "true"
           - matchExpressions:
               - key: hardware
                 operator: In
                 values:
                   - rpi5
+      preferredDuringSchedulingIgnoredDuringExecution:
+        - weight: 100
+          preference:
+            matchExpressions:
+              - key: jetson
+                operator: In
+                values:
+                  - "true"
   sysctlInit:
     enabled: true
@@ -76,15 +76,25 @@ spec:
              memory: "256Mi"
            limits:
              memory: "512Mi"
-      nodeSelector:
-        node-role.kubernetes.io/worker: "true"
-        hardware: rpi5
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
             nodeSelectorTerms:
+              - matchExpressions:
+                  - key: jetson
+                    operator: In
+                    values:
+                      - "true"
               - matchExpressions:
                   - key: hardware
                     operator: In
                     values:
                       - rpi5
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              preference:
+                matchExpressions:
+                  - key: jetson
+                    operator: In
+                    values:
+                      - "true"
@@ -12,39 +12,77 @@ k3s_agent_dropin="/host/etc/systemd/system/k3s-agent.service.d/99-logging.conf"
 k3s_image_gc_dropin="/host/etc/systemd/system/k3s.service.d/98-image-gc.conf"
 k3s_agent_image_gc_dropin="/host/etc/systemd/system/k3s-agent.service.d/98-image-gc.conf"
 
-if [ ! -f "${journald_dropin}" ]; then
-  mkdir -p "$(dirname "${journald_dropin}")"
-  printf "[Journal]\nStorage=volatile\nRuntimeMaxUse=200M\nRuntimeKeepFree=512M\nMaxFileSec=1h\n" > "${journald_dropin}"
-  changed=1
-  journald_changed=1
-fi
-
-if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_dropin}" ]; then
-  mkdir -p "$(dirname "${k3s_dropin}")"
-  printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_dropin}"
-  changed=1
-  k3s_changed=1
-fi
-
-if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_image_gc_dropin}" ]; then
-  mkdir -p "$(dirname "${k3s_image_gc_dropin}")"
-  printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_image_gc_dropin}"
-  changed=1
-  k3s_changed=1
-fi
-
-if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_dropin}" ]; then
-  mkdir -p "$(dirname "${k3s_agent_dropin}")"
-  printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_agent_dropin}"
-  changed=1
-  k3s_agent_changed=1
-fi
-
-if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_image_gc_dropin}" ]; then
-  mkdir -p "$(dirname "${k3s_agent_image_gc_dropin}")"
-  printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_agent_image_gc_dropin}"
-  changed=1
-  k3s_agent_changed=1
-fi
-
+ensure_dropin() {
+  local path="$1"
+  local owner="$2"
+  local new_content="$3"
+  local current=""
+  if [ -f "${path}" ]; then
+    current="$(cat "${path}" || true)"
+  fi
+  if [ "${current}" != "${new_content}" ]; then
+    mkdir -p "$(dirname "${path}")"
+    printf "%s\n" "${new_content}" > "${path}"
+    changed=1
+    case "${owner}" in
+      journald)
+        journald_changed=1
+        ;;
+      k3s)
+        k3s_changed=1
+        ;;
+      k3s-agent)
+        k3s_agent_changed=1
+        ;;
+    esac
+  fi
+}
+
+ensure_dropin \
+  "${journald_dropin}" \
+  "journald" \
+  "[Journal]
+Storage=volatile
+RuntimeMaxUse=200M
+RuntimeKeepFree=512M
+MaxFileSec=1h"
+
+if [ -f "/host/etc/systemd/system/k3s.service" ]; then
+  ensure_dropin \
+    "${k3s_dropin}" \
+    "k3s" \
+    "[Service]
+Environment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"
+Environment=\"K3S_KUBELET_ARG=container-log-max-files=2\""
+fi
+
+if [ -f "/host/etc/systemd/system/k3s.service" ]; then
+  ensure_dropin \
+    "${k3s_image_gc_dropin}" \
+    "k3s" \
+    "[Service]
+Environment=\"K3S_KUBELET_ARG=image-gc-high-threshold=65\"
+Environment=\"K3S_KUBELET_ARG=image-gc-low-threshold=50\"
+Environment=\"K3S_KUBELET_ARG=image-gc-minimum-available=8Gi\""
+fi
+
+if [ -f "/host/etc/systemd/system/k3s-agent.service" ]; then
+  ensure_dropin \
+    "${k3s_agent_dropin}" \
+    "k3s-agent" \
+    "[Service]
+Environment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"
+Environment=\"K3S_KUBELET_ARG=container-log-max-files=2\""
+fi
+
+if [ -f "/host/etc/systemd/system/k3s-agent.service" ]; then
+  ensure_dropin \
+    "${k3s_agent_image_gc_dropin}" \
+    "k3s-agent" \
+    "[Service]
+Environment=\"K3S_KUBELET_ARG=image-gc-high-threshold=65\"
+Environment=\"K3S_KUBELET_ARG=image-gc-low-threshold=50\"
+Environment=\"K3S_KUBELET_ARG=image-gc-minimum-available=8Gi\""
+fi
+
 if [ "${changed}" -eq 1 ]; then
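`ensure_dropin` makes the drop-ins convergent: it rewrites a file only when its content drifts, so repeated runs no longer depend on "file missing" as the trigger and threshold changes (70/60/5Gi to 65/50/8Gi here) actually propagate. A spot check on an agent node, assuming the unit and paths from this script, might be:

```sh
# On the host: confirm the drop-in landed with the new thresholds...
cat /etc/systemd/system/k3s-agent.service.d/98-image-gc.conf
# ...and that the unit will see the kubelet-arg environment entries.
systemctl show k3s-agent -p Environment
```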
@@ -18,6 +18,7 @@ spec:
         prometheus.io/scrape: "true"
         prometheus.io/port: "8080"
         prometheus.io/path: "/metrics"
+        maintenance.bstein.dev/restart-rev: "20260207-2"
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "maintenance"
         vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
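Bumping a pod-template annotation such as `restart-rev` is a standard way to force a rolling restart without touching the image. An imperative equivalent, as a sketch only (deployment name and namespace are assumptions, not confirmed by this diff), would be:

```sh
# Hypothetical names: adjust to the actual ariadne Deployment and namespace.
kubectl -n maintenance patch deployment ariadne --type merge -p \
  '{"spec":{"template":{"metadata":{"annotations":{"maintenance.bstein.dev/restart-rev":"20260207-2"}}}}}'
```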
@@ -105,7 +106,7 @@ spec:
         node-role.kubernetes.io/worker: "true"
       containers:
         - name: ariadne
-          image: registry.bstein.dev/bstein/ariadne:0.1.0-0
+          image: registry.bstein.dev/bstein/ariadne:latest
           imagePullPolicy: Always
           command: ["/bin/sh", "-c"]
           args:
@@ -285,7 +286,7 @@ spec:
             - name: ARIADNE_SCHEDULE_MAILU_SYNC
               value: "30 4 * * *"
             - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC
-              value: "0 5 * * *"
+              value: "*/15 * * * *"
             - name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON
               value: "*/5 * * * *"
             - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
@@ -293,23 +294,23 @@ spec:
             - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
               value: "0 * * * *"
             - name: ARIADNE_SCHEDULE_WGER_USER_SYNC
-              value: "0 5 * * *"
+              value: "*/15 * * * *"
             - name: ARIADNE_SCHEDULE_WGER_ADMIN
               value: "15 3 * * *"
             - name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC
-              value: "0 6 * * *"
+              value: "*/15 * * * *"
             - name: ARIADNE_SCHEDULE_FIREFLY_CRON
               value: "0 3 * * *"
             - name: ARIADNE_SCHEDULE_POD_CLEANER
-              value: "0 * * * *"
+              value: "*/30 * * * *"
             - name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE
               value: "23 3 * * *"
             - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
-              value: "30 4 * * 0"
+              value: "0 */4 * * *"
             - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
-              value: "0 * * * *"
+              value: "*/15 * * * *"
             - name: ARIADNE_SCHEDULE_VAULT_OIDC
-              value: "0 * * * *"
+              value: "*/15 * * * *"
             - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
               value: "*/5 * * * *"
             - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
@@ -319,9 +320,9 @@ spec:
             - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM
               value: "*/10 * * * *"
             - name: ARIADNE_SCHEDULE_CLUSTER_STATE
-              value: "*/15 * * * *"
+              value: "*/10 * * * *"
             - name: ARIADNE_CLUSTER_STATE_KEEP
-              value: "168"
+              value: "720"
             - name: WELCOME_EMAIL_ENABLED
               value: "true"
             - name: K8S_API_TIMEOUT_SEC
@@ -330,6 +331,8 @@ spec:
               value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
             - name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC
               value: "5"
+            - name: ARIADNE_ALERTMANAGER_URL
+              value: http://alertmanager.monitoring.svc.cluster.local
             - name: OPENSEARCH_URL
               value: http://opensearch-master.logging.svc.cluster.local:9200
             - name: OPENSEARCH_LIMIT_BYTES
@@ -33,17 +33,15 @@ spec:
           command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
           env:
             - name: SWEEP_INTERVAL_SEC
-              value: "21600"
+              value: "7200"
             - name: HIGH_USAGE_PERCENT
               value: "70"
             - name: EMERGENCY_USAGE_PERCENT
               value: "80"
             - name: BASE_THRESHOLD_DAYS
               value: "14"
             - name: HIGH_USAGE_THRESHOLD_DAYS
               value: "3"
-            - name: LOG_RETENTION_DAYS
-              value: "7"
+            - name: ORPHAN_POD_RETENTION_DAYS
+              value: "3"
-            - name: JOURNAL_MAX_SIZE
-              value: "200M"
           securityContext:
 
@@ -3,96 +3,71 @@ set -eu
 ONE_SHOT=${ONE_SHOT:-false}
 SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
 BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
 HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
 HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
 EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
 LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
+ORPHAN_POD_RETENTION_DAYS=${ORPHAN_POD_RETENTION_DAYS:-3}
 JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
 SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
 
+cleanup_orphaned_hdd_pod_logs() {
+  if [ ! -d /host/var/log.hdd/pods ]; then
+    return 0
+  fi
+
+  ORPHAN_POD_RETENTION_DAYS="${ORPHAN_POD_RETENTION_DAYS}" python3 - <<'PY'
+import os
+import shutil
+import time
+
+hdd_pods = "/host/var/log.hdd/pods"
+active_pods = "/host/var/log/pods"
+retention_days = int(os.environ.get("ORPHAN_POD_RETENTION_DAYS", "3"))
+cutoff = time.time() - (retention_days * 86400)
+
+try:
+    active_names = set(os.listdir(active_pods))
+except Exception:
+    active_names = set()
+
+try:
+    hdd_names = os.listdir(hdd_pods)
+except Exception:
+    hdd_names = []
+
+for name in hdd_names:
+    path = os.path.join(hdd_pods, name)
+    if not os.path.isdir(path):
+        continue
+    if name in active_names:
+        continue
+    try:
+        mtime = os.path.getmtime(path)
+    except Exception:
+        continue
+    if mtime > cutoff:
+        continue
+    print(path)
+    shutil.rmtree(path, ignore_errors=True)
+PY
+}
+
 sweep_once() {
   usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
   threshold_days="${BASE_THRESHOLD_DAYS}"
 
+  # crictl image metadata frequently omits createdAt on this cluster; prune by
+  # runtime reachability whenever rootfs crosses pressure thresholds.
   if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
     threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
+    chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true"
   fi
 
-  cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY'
-import os
-import time
-
-days = int(os.environ.get("THRESHOLD_DAYS", "14"))
-print(int(time.time()) - days * 86400)
-PY
-)
-
-  RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
-  IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
-
-  prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
-import json
-import os
-import sys
-import time
-
-try:
-    data = json.load(sys.stdin)
-except Exception:
-    print("", end="")
-    sys.exit(0)
-
-cutoff = int(os.environ.get("CUTOFF", "0"))
-running = set(os.environ.get("RUNNING", "").split())
-skip = os.environ.get("SKIP", "").split()
-now = int(time.time())
-prune = []
-
-
-def is_skip(tags):
-    if not tags:
-        return False
-    for t in tags:
-        for prefix in skip:
-            if prefix and t.startswith(prefix):
-                return True
-    return False
-
-
-for img in data.get("images", []):
-    image_id = img.get("id", "")
-    if not image_id:
-        continue
-    if image_id in running:
-        continue
-    tags = img.get("repoTags") or []
-    if is_skip(tags):
-        continue
-    created = img.get("createdAt") or 0
-    try:
-        created = int(str(created)) // 1000000000
-    except Exception:
-        created = 0
-    if created and created > now:
-        created = now
-    if cutoff and created and created < cutoff:
-        prune.append(image_id)
-
-seen = set()
-for p in prune:
-    if p in seen:
-        continue
-    seen.add(p)
-    print(p)
-PY
-)
-
-  if [ -n "${prune_list}" ]; then
-    printf "%s" "${prune_list}" | while read -r image_id; do
-      if [ -n "${image_id}" ]; then
-        chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
-      fi
-    done
-  fi
+  cleanup_orphaned_hdd_pod_logs
+
+  if [ -d /host/var/log.hdd/pods ]; then
+    find /host/var/log.hdd/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
+  fi
+
+  if [ -d /host/var/log.hdd/containers ]; then
+    find /host/var/log.hdd/containers -xtype l -print -delete 2>/dev/null || true
+  fi
 
   find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
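The rewrite drops the age-based prune list entirely and leans on `crictl rmi --prune`, which removes every image not referenced by a container, so stale `createdAt` metadata can no longer stall cleanup. The manual equivalent of the new pressure-triggered pass, run directly on a node, is roughly:

```sh
# Rootfs usage percentage, the same figure the script compares against.
df -P / | awk 'NR==2 {gsub(/%/,"",$5); print $5}'
# Drop all images that no container currently references.
crictl rmi --prune
```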
@@ -100,9 +75,11 @@ PY
 
   if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
+    # Emergency pass for rootfs pressure on SD-backed nodes.
+    chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true"
     chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
     find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
     find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
     find /host/var/log.hdd -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
     chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
   fi
 }
@@ -303,8 +303,56 @@
           summary: "node-image-sweeper not fully ready"
         labels:
           severity: warning
+      - uid: logging-node-log-rotation-not-ready
+        title: "Node log rotation guardrails not ready"
+        condition: C
+        for: "10m"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: atlas-vm
+            model:
+              intervalMs: 60000
+              maxDataPoints: 43200
+              expr: kube_daemonset_status_number_ready{namespace="logging",daemonset="node-log-rotation"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="logging",daemonset="node-log-rotation"}
+              legendFormat: '{{daemonset}}'
+              datasource:
+                type: prometheus
+                uid: atlas-vm
+          - refId: B
+            datasourceUid: __expr__
+            model:
+              expression: A
+              intervalMs: 60000
+              maxDataPoints: 43200
+              reducer: last
+              type: reduce
+          - refId: C
+            datasourceUid: __expr__
+            model:
+              expression: B
+              intervalMs: 60000
+              maxDataPoints: 43200
+              type: threshold
+              conditions:
+                - evaluator:
+                    params: [1]
+                    type: lt
+                  operator:
+                    type: and
+                  reducer:
+                    type: last
+                  type: query
+        noDataState: NoData
+        execErrState: Error
+        annotations:
+          summary: "node-log-rotation is not fully ready"
+        labels:
+          severity: warning
       - uid: maint-ariadne-image-sweeper-stale
-        title: "Ariadne image sweeper stale (schedule >8d)"
+        title: "Ariadne image sweeper stale (schedule >24h)"
         condition: C
         for: "5m"
         data:
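The new alert fires when the ready/desired ratio of the node-log-rotation DaemonSet stays below 1 for ten minutes. To hand-check the expression against the same datasource, one sketch (reusing the VictoriaMetrics service URL that appears elsewhere in this commit, queried via the Prometheus-compatible API) is:

```sh
curl -s 'http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428/api/v1/query' \
  --data-urlencode 'query=kube_daemonset_status_number_ready{namespace="logging",daemonset="node-log-rotation"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="logging",daemonset="node-log-rotation"}'
```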
@@ -338,7 +386,7 @@
               type: threshold
               conditions:
                 - evaluator:
-                    params: [691200]
+                    params: [86400]
                     type: gt
                   operator:
                     type: and
@@ -348,7 +396,7 @@
         noDataState: OK
         execErrState: Error
         annotations:
-          summary: "Ariadne image sweeper stale >8d since last success"
+          summary: "Ariadne image sweeper stale >24h since last success"
         labels:
           severity: warning
       - uid: maint-cron-stale