diff --git a/scripts/node_recover.sh b/scripts/node_recover.sh
new file mode 100755
index 00000000..44e656f3
--- /dev/null
+++ b/scripts/node_recover.sh
@@ -0,0 +1,163 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat <<USAGE
+Usage: $(basename "$0") <node-name> [options]
+
+Options:
+  --yes            Skip confirmation prompt
+  --skip-drain     Do not cordon/drain; only capture recovery artifacts
+  --delete-node    Delete Node object after drain (for hard-dead node replacement)
+  --out-dir <dir>  Recovery artifact directory (default: ./artifacts/node-recovery)
+  -h, --help       Show this help
+USAGE
+}
+
+if ! command -v kubectl >/dev/null 2>&1; then
+  echo "kubectl is required" >&2
+  exit 1
+fi
+if ! command -v jq >/dev/null 2>&1; then
+  echo "jq is required" >&2
+  exit 1
+fi
+
+if [ "$#" -lt 1 ]; then
+  usage
+  exit 1
+fi
+
+node=""
+assume_yes="false"
+skip_drain="false"
+delete_node="false"
+out_dir="./artifacts/node-recovery"
+
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --yes)
+      assume_yes="true"
+      shift
+      ;;
+    --skip-drain)
+      skip_drain="true"
+      shift
+      ;;
+    --delete-node)
+      delete_node="true"
+      shift
+      ;;
+    --out-dir)
+      out_dir="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    -*)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+    *)
+      if [ -z "${node}" ]; then
+        node="$1"
+      else
+        echo "Unexpected argument: $1" >&2
+        usage
+        exit 1
+      fi
+      shift
+      ;;
+  esac
+done
+
+if [ -z "${node}" ]; then
+  echo "Node name is required" >&2
+  usage
+  exit 1
+fi
+
+if ! kubectl get node "${node}" >/dev/null 2>&1; then
+  echo "Node ${node} not found in cluster API" >&2
+  exit 1
+fi
+
+if [ "${assume_yes}" != "true" ]; then
+  echo "About to prepare recovery workflow for node: ${node}"
+  echo "skip_drain=${skip_drain} delete_node=${delete_node}"
+  read -r -p "Type the node name to continue: " confirm
+  if [ "${confirm}" != "${node}" ]; then
+    echo "Confirmation did not match node name; aborting."
+    exit 1
+  fi
+fi
+
+timestamp="$(date +%Y%m%d-%H%M%S)"
+artifacts_dir="${out_dir}/${node}-${timestamp}"
+mkdir -p "${artifacts_dir}"
+
+echo "Saving node and workload artifacts to ${artifacts_dir}"
+kubectl get node "${node}" -o json > "${artifacts_dir}/node.json"
+kubectl get node "${node}" --show-labels > "${artifacts_dir}/node.txt"
+kubectl get pods -A --field-selector "spec.nodeName=${node}" -o wide > "${artifacts_dir}/pods-on-node.txt"
+
+# Emit replay scripts for the node's labels and taints; well-known system label
+# namespaces (kubernetes.io/, beta.kubernetes.io/, node.kubernetes.io/) are skipped.
+jq -r --arg node "${node}" '
+  .metadata.labels
+  | to_entries[]
+  | select(
+      .key != "kubernetes.io/hostname"
+      and .key != "beta.kubernetes.io/hostname"
+      and .key != "node.kubernetes.io/instance-type"
+      and .key != "beta.kubernetes.io/instance-type"
+      and (.key | startswith("kubernetes.io/") | not)
+      and (.key | startswith("beta.kubernetes.io/") | not)
+      and (.key | startswith("node.kubernetes.io/") | not)
+    )
+  | "kubectl label node " + $node + " " + .key + "=" + .value + " --overwrite"
+' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-labels.sh"
+
+jq -r --arg node "${node}" '
+  (.spec.taints // [])[]
+  | "kubectl taint node " + $node + " "
+    + .key
+    + (if .value then "=" + .value else "" end)
+    + ":"
+    + .effect
+    + " --overwrite"
+' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-taints.sh"
+
+chmod +x "${artifacts_dir}/restore-labels.sh" "${artifacts_dir}/restore-taints.sh"
+
+if [ "${skip_drain}" != "true" ]; then
+  echo "Cordoning ${node}"
+  kubectl cordon "${node}" || true
+
+  echo "Draining ${node}"
+  # Drain escalation: eviction-based drain first, then --force for pods without
+  # a controller, then --disable-eviction (bypasses PodDisruptionBudgets) last.
+  if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m; then
+    echo "Standard drain failed; retrying with --force"
+    if ! 
kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force; then + echo "Force drain failed; retrying with --disable-eviction" + kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force --disable-eviction + fi + fi +fi + +if [ "${delete_node}" = "true" ]; then + echo "Deleting node object ${node}" + kubectl delete node "${node}" || true +fi + +cat < --yes --delete-node` from `titan-iac`. +2. Reimage/reprovision the replacement host. +3. Rejoin the replacement node to k3s. +4. Reapply labels and taints from generated artifacts: + - `artifacts/node-recovery/-/restore-labels.sh` + - `artifacts/node-recovery/-/restore-taints.sh` +5. Verify workloads, then uncordon the replacement node. + +### Notes +- `node_recover.sh` snapshots node labels/taints and current pod placement before drain. +- Use `--skip-drain` for a dead/unreachable node where only artifact capture is possible. +- Use `--delete-node` after drain (or for hard-dead nodes) so replacement join is clean. + ## Node classes (current map) - rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent) - rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint) diff --git a/services/logging/data-prepper-helmrelease.yaml b/services/logging/data-prepper-helmrelease.yaml index 1c0bc45d..acf9a352 100644 --- a/services/logging/data-prepper-helmrelease.yaml +++ b/services/logging/data-prepper-helmrelease.yaml @@ -40,15 +40,25 @@ spec: memory: "512Mi" limits: memory: "1Gi" - nodeSelector: - node-role.kubernetes.io/worker: "true" - hardware: rpi5 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: + - matchExpressions: + - key: jetson + operator: In + values: + - "true" - matchExpressions: - key: hardware operator: In values: - rpi5 + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: jetson + operator: In + values: + - "true" diff --git a/services/logging/fluent-bit-helmrelease.yaml b/services/logging/fluent-bit-helmrelease.yaml index e16890c9..a330340e 100644 --- a/services/logging/fluent-bit-helmrelease.yaml +++ b/services/logging/fluent-bit-helmrelease.yaml @@ -51,7 +51,7 @@ spec: service: | [SERVICE] Flush 1 - Log_Level info + Log_Level warn Daemon Off Parsers_File parsers.conf Parsers_File custom_parsers.conf @@ -74,7 +74,7 @@ spec: Refresh_Interval 10 Rotate_Wait 30 Inotify_Watcher false - Read_from_Head On + Read_from_Head Off DB /var/lib/fluent-bit/kube.db storage.type filesystem @@ -82,7 +82,7 @@ spec: Name systemd Tag journald.* Path /var/log/journal - Read_From_Tail Off + Read_From_Tail On DB /var/lib/fluent-bit/systemd.db storage.type filesystem filters: | @@ -107,7 +107,7 @@ spec: Logstash_Prefix kube Replace_Dots On Suppress_Type_Name On - Retry_Limit False + Retry_Limit 10 [OUTPUT] Name es @@ -119,4 +119,4 @@ spec: Logstash_Prefix journald Replace_Dots On Suppress_Type_Name On - Retry_Limit False + Retry_Limit 10 diff --git a/services/logging/node-log-rotation-daemonset.yaml b/services/logging/node-log-rotation-daemonset.yaml index f6a672c3..b7753c36 100644 --- a/services/logging/node-log-rotation-daemonset.yaml +++ b/services/logging/node-log-rotation-daemonset.yaml @@ -24,7 +24,17 @@ spec: operator: Exists effect: NoSchedule nodeSelector: - hardware: rpi5 + node-role.kubernetes.io/worker: "true" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + 
nodeSelectorTerms: + - matchExpressions: + - key: hardware + operator: In + values: + - rpi4 + - rpi5 containers: - name: node-log-rotation image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 diff --git a/services/logging/opensearch-dashboards-helmrelease.yaml b/services/logging/opensearch-dashboards-helmrelease.yaml index 85f859e7..bf6c41fc 100644 --- a/services/logging/opensearch-dashboards-helmrelease.yaml +++ b/services/logging/opensearch-dashboards-helmrelease.yaml @@ -37,15 +37,25 @@ spec: limits: cpu: "200m" memory: "512Mi" - nodeSelector: - node-role.kubernetes.io/worker: "true" - hardware: rpi5 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: + - matchExpressions: + - key: jetson + operator: In + values: + - "true" - matchExpressions: - key: hardware operator: In values: - rpi5 + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: jetson + operator: In + values: + - "true" diff --git a/services/logging/opensearch-helmrelease.yaml b/services/logging/opensearch-helmrelease.yaml index 3d7dd6b7..c43a2b9e 100644 --- a/services/logging/opensearch-helmrelease.yaml +++ b/services/logging/opensearch-helmrelease.yaml @@ -40,17 +40,27 @@ spec: discovery.type: single-node plugins.security.disabled: true node.store.allow_mmap: false - nodeSelector: - node-role.kubernetes.io/worker: "true" - hardware: rpi5 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: + - matchExpressions: + - key: jetson + operator: In + values: + - "true" - matchExpressions: - key: hardware operator: In values: - rpi5 + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: jetson + operator: In + values: + - "true" sysctlInit: enabled: true diff --git a/services/logging/otel-collector-helmrelease.yaml b/services/logging/otel-collector-helmrelease.yaml index c24682f5..b1bcc25a 100644 --- a/services/logging/otel-collector-helmrelease.yaml +++ b/services/logging/otel-collector-helmrelease.yaml @@ -76,15 +76,25 @@ spec: memory: "256Mi" limits: memory: "512Mi" - nodeSelector: - node-role.kubernetes.io/worker: "true" - hardware: rpi5 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: + - matchExpressions: + - key: jetson + operator: In + values: + - "true" - matchExpressions: - key: hardware operator: In values: - rpi5 + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: jetson + operator: In + values: + - "true" diff --git a/services/logging/scripts/node_log_rotation.sh b/services/logging/scripts/node_log_rotation.sh index 534806ff..c12847e0 100644 --- a/services/logging/scripts/node_log_rotation.sh +++ b/services/logging/scripts/node_log_rotation.sh @@ -12,39 +12,77 @@ k3s_agent_dropin="/host/etc/systemd/system/k3s-agent.service.d/99-logging.conf" k3s_image_gc_dropin="/host/etc/systemd/system/k3s.service.d/98-image-gc.conf" k3s_agent_image_gc_dropin="/host/etc/systemd/system/k3s-agent.service.d/98-image-gc.conf" -if [ ! 
-f "${journald_dropin}" ]; then - mkdir -p "$(dirname "${journald_dropin}")" - printf "[Journal]\nStorage=volatile\nRuntimeMaxUse=200M\nRuntimeKeepFree=512M\nMaxFileSec=1h\n" > "${journald_dropin}" - changed=1 - journald_changed=1 +ensure_dropin() { + local path="$1" + local owner="$2" + local new_content="$3" + local current="" + if [ -f "${path}" ]; then + current="$(cat "${path}" || true)" + fi + if [ "${current}" != "${new_content}" ]; then + mkdir -p "$(dirname "${path}")" + printf "%s\n" "${new_content}" > "${path}" + changed=1 + case "${owner}" in + journald) + journald_changed=1 + ;; + k3s) + k3s_changed=1 + ;; + k3s-agent) + k3s_agent_changed=1 + ;; + esac + fi +} + +ensure_dropin \ + "${journald_dropin}" \ + "journald" \ + "[Journal] +Storage=volatile +RuntimeMaxUse=200M +RuntimeKeepFree=512M +MaxFileSec=1h" + +if [ -f "/host/etc/systemd/system/k3s.service" ]; then + ensure_dropin \ + "${k3s_dropin}" \ + "k3s" \ + "[Service] +Environment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\" +Environment=\"K3S_KUBELET_ARG=container-log-max-files=2\"" fi -if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_dropin}" ]; then - mkdir -p "$(dirname "${k3s_dropin}")" - printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_dropin}" - changed=1 - k3s_changed=1 +if [ -f "/host/etc/systemd/system/k3s.service" ]; then + ensure_dropin \ + "${k3s_image_gc_dropin}" \ + "k3s" \ + "[Service] +Environment=\"K3S_KUBELET_ARG=image-gc-high-threshold=65\" +Environment=\"K3S_KUBELET_ARG=image-gc-low-threshold=50\" +Environment=\"K3S_KUBELET_ARG=image-gc-minimum-available=8Gi\"" fi -if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_image_gc_dropin}" ]; then - mkdir -p "$(dirname "${k3s_image_gc_dropin}")" - printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_image_gc_dropin}" - changed=1 - k3s_changed=1 +if [ -f "/host/etc/systemd/system/k3s-agent.service" ]; then + ensure_dropin \ + "${k3s_agent_dropin}" \ + "k3s-agent" \ + "[Service] +Environment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\" +Environment=\"K3S_KUBELET_ARG=container-log-max-files=2\"" fi -if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_dropin}" ]; then - mkdir -p "$(dirname "${k3s_agent_dropin}")" - printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_agent_dropin}" - changed=1 - k3s_agent_changed=1 -fi - -if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! 
-f "${k3s_agent_image_gc_dropin}" ]; then - mkdir -p "$(dirname "${k3s_agent_image_gc_dropin}")" - printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_agent_image_gc_dropin}" - changed=1 - k3s_agent_changed=1 +if [ -f "/host/etc/systemd/system/k3s-agent.service" ]; then + ensure_dropin \ + "${k3s_agent_image_gc_dropin}" \ + "k3s-agent" \ + "[Service] +Environment=\"K3S_KUBELET_ARG=image-gc-high-threshold=65\" +Environment=\"K3S_KUBELET_ARG=image-gc-low-threshold=50\" +Environment=\"K3S_KUBELET_ARG=image-gc-minimum-available=8Gi\"" fi if [ "${changed}" -eq 1 ]; then diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index fce1ded5..e5eacf6f 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -18,6 +18,7 @@ spec: prometheus.io/scrape: "true" prometheus.io/port: "8080" prometheus.io/path: "/metrics" + maintenance.bstein.dev/restart-rev: "20260207-2" vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/role: "maintenance" vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db" @@ -105,7 +106,7 @@ spec: node-role.kubernetes.io/worker: "true" containers: - name: ariadne - image: registry.bstein.dev/bstein/ariadne:0.1.0-0 + image: registry.bstein.dev/bstein/ariadne:latest imagePullPolicy: Always command: ["/bin/sh", "-c"] args: @@ -285,7 +286,7 @@ spec: - name: ARIADNE_SCHEDULE_MAILU_SYNC value: "30 4 * * *" - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC - value: "0 5 * * *" + value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON value: "*/5 * * * *" - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE @@ -293,23 +294,23 @@ spec: - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC value: "0 * * * *" - name: ARIADNE_SCHEDULE_WGER_USER_SYNC - value: "0 5 * * *" + value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_WGER_ADMIN value: "15 3 * * *" - name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC - value: "0 6 * * *" + value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_FIREFLY_CRON value: "0 3 * * *" - name: ARIADNE_SCHEDULE_POD_CLEANER - value: "0 * * * *" + value: "*/30 * * * *" - name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE value: "23 3 * * *" - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER - value: "30 4 * * 0" + value: "0 */4 * * *" - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH - value: "0 * * * *" + value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_VAULT_OIDC - value: "0 * * * *" + value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME value: "*/5 * * * *" - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE @@ -319,9 +320,9 @@ spec: - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM value: "*/10 * * * *" - name: ARIADNE_SCHEDULE_CLUSTER_STATE - value: "*/15 * * * *" + value: "*/10 * * * *" - name: ARIADNE_CLUSTER_STATE_KEEP - value: "168" + value: "720" - name: WELCOME_EMAIL_ENABLED value: "true" - name: K8S_API_TIMEOUT_SEC @@ -330,6 +331,8 @@ spec: value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428 - name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC value: "5" + - name: ARIADNE_ALERTMANAGER_URL + value: http://alertmanager.monitoring.svc.cluster.local - name: OPENSEARCH_URL value: http://opensearch-master.logging.svc.cluster.local:9200 - name: OPENSEARCH_LIMIT_BYTES diff --git a/services/maintenance/node-image-sweeper-daemonset.yaml b/services/maintenance/node-image-sweeper-daemonset.yaml index 488c0605..5b03cdc4 100644 
--- a/services/maintenance/node-image-sweeper-daemonset.yaml +++ b/services/maintenance/node-image-sweeper-daemonset.yaml @@ -33,17 +33,15 @@ spec: command: ["/bin/sh", "/scripts/node_image_sweeper.sh"] env: - name: SWEEP_INTERVAL_SEC - value: "21600" + value: "7200" - name: HIGH_USAGE_PERCENT value: "70" - name: EMERGENCY_USAGE_PERCENT value: "80" - - name: BASE_THRESHOLD_DAYS - value: "14" - - name: HIGH_USAGE_THRESHOLD_DAYS - value: "3" - name: LOG_RETENTION_DAYS value: "7" + - name: ORPHAN_POD_RETENTION_DAYS + value: "3" - name: JOURNAL_MAX_SIZE value: "200M" securityContext: diff --git a/services/maintenance/scripts/node_image_sweeper.sh b/services/maintenance/scripts/node_image_sweeper.sh index c2fb6da1..98eedd8f 100644 --- a/services/maintenance/scripts/node_image_sweeper.sh +++ b/services/maintenance/scripts/node_image_sweeper.sh @@ -3,96 +3,71 @@ set -eu ONE_SHOT=${ONE_SHOT:-false} SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600} -BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14} -HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3} HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70} EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85} LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7} +ORPHAN_POD_RETENTION_DAYS=${ORPHAN_POD_RETENTION_DAYS:-3} JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M} -SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause" + +cleanup_orphaned_hdd_pod_logs() { + if [ ! -d /host/var/log.hdd/pods ]; then + return 0 + fi + + ORPHAN_POD_RETENTION_DAYS="${ORPHAN_POD_RETENTION_DAYS}" python3 - <<'PY' +import os +import shutil +import time + +hdd_pods = "/host/var/log.hdd/pods" +active_pods = "/host/var/log/pods" +retention_days = int(os.environ.get("ORPHAN_POD_RETENTION_DAYS", "3")) +cutoff = time.time() - (retention_days * 86400) + +try: + active_names = set(os.listdir(active_pods)) +except Exception: + active_names = set() + +try: + hdd_names = os.listdir(hdd_pods) +except Exception: + hdd_names = [] + +for name in hdd_names: + path = os.path.join(hdd_pods, name) + if not os.path.isdir(path): + continue + if name in active_names: + continue + try: + mtime = os.path.getmtime(path) + except Exception: + continue + if mtime > cutoff: + continue + print(path) + shutil.rmtree(path, ignore_errors=True) +PY +} sweep_once() { usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage="" - threshold_days="${BASE_THRESHOLD_DAYS}" + + # crictl image metadata frequently omits createdAt on this cluster; prune by + # runtime reachability whenever rootfs crosses pressure thresholds. 
if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then - threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}" + chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true" fi - cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY' -import os -import time + cleanup_orphaned_hdd_pod_logs -days = int(os.environ.get("THRESHOLD_DAYS", "14")) -print(int(time.time()) - days * 86400) -PY -) + if [ -d /host/var/log.hdd/pods ]; then + find /host/var/log.hdd/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true + fi - RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ') - IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}') - - prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY' -import json -import os -import sys -import time - -try: - data = json.load(sys.stdin) -except Exception: - print("", end="") - sys.exit(0) - -cutoff = int(os.environ.get("CUTOFF", "0")) -running = set(os.environ.get("RUNNING", "").split()) -skip = os.environ.get("SKIP", "").split() -now = int(time.time()) -prune = [] - - -def is_skip(tags): - if not tags: - return False - for t in tags: - for prefix in skip: - if prefix and t.startswith(prefix): - return True - return False - - -for img in data.get("images", []): - image_id = img.get("id", "") - if not image_id: - continue - if image_id in running: - continue - tags = img.get("repoTags") or [] - if is_skip(tags): - continue - created = img.get("createdAt") or 0 - try: - created = int(str(created)) // 1000000000 - except Exception: - created = 0 - if created and created > now: - created = now - if cutoff and created and created < cutoff: - prune.append(image_id) - -seen = set() -for p in prune: - if p in seen: - continue - seen.add(p) - print(p) -PY -) - - if [ -n "${prune_list}" ]; then - printf "%s" "${prune_list}" | while read -r image_id; do - if [ -n "${image_id}" ]; then - chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true - fi - done + if [ -d /host/var/log.hdd/containers ]; then + find /host/var/log.hdd/containers -xtype l -print -delete 2>/dev/null || true fi find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true @@ -100,9 +75,11 @@ PY if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then # Emergency pass for rootfs pressure on SD-backed nodes. 
+ chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true" chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true" find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true + find /host/var/log.hdd -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi" fi } diff --git a/services/monitoring/grafana-alerting-config.yaml b/services/monitoring/grafana-alerting-config.yaml index 0bc792f4..934a0721 100644 --- a/services/monitoring/grafana-alerting-config.yaml +++ b/services/monitoring/grafana-alerting-config.yaml @@ -303,8 +303,56 @@ data: summary: "node-image-sweeper not fully ready" labels: severity: warning + - uid: logging-node-log-rotation-not-ready + title: "Node log rotation guardrails not ready" + condition: C + for: "10m" + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: atlas-vm + model: + intervalMs: 60000 + maxDataPoints: 43200 + expr: kube_daemonset_status_number_ready{namespace="logging",daemonset="node-log-rotation"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="logging",daemonset="node-log-rotation"} + legendFormat: '{{daemonset}}' + datasource: + type: prometheus + uid: atlas-vm + - refId: B + datasourceUid: __expr__ + model: + expression: A + intervalMs: 60000 + maxDataPoints: 43200 + reducer: last + type: reduce + - refId: C + datasourceUid: __expr__ + model: + expression: B + intervalMs: 60000 + maxDataPoints: 43200 + type: threshold + conditions: + - evaluator: + params: [1] + type: lt + operator: + type: and + reducer: + type: last + type: query + noDataState: NoData + execErrState: Error + annotations: + summary: "node-log-rotation is not fully ready" + labels: + severity: warning - uid: maint-ariadne-image-sweeper-stale - title: "Ariadne image sweeper stale (schedule >8d)" + title: "Ariadne image sweeper stale (schedule >24h)" condition: C for: "5m" data: @@ -338,7 +386,7 @@ data: type: threshold conditions: - evaluator: - params: [691200] + params: [86400] type: gt operator: type: and @@ -348,7 +396,7 @@ data: noDataState: OK execErrState: Error annotations: - summary: "Ariadne image sweeper stale >8d since last success" + summary: "Ariadne image sweeper stale >24h since last success" labels: severity: warning - uid: maint-cron-stale