maintenance: harden sd-write controls and recovery workflow

Brad Stein 2026-03-31 00:06:44 -03:00
parent 7a70c22a46
commit 03ae79df3e
13 changed files with 432 additions and 141 deletions

scripts/node_recover.sh Executable file

@@ -0,0 +1,163 @@
#!/usr/bin/env bash
set -euo pipefail
usage() {
cat <<USAGE
Usage: scripts/node_recover.sh <node-name> [options]
Options:
--yes Skip confirmation prompt
--skip-drain Do not cordon/drain; only capture recovery artifacts
--delete-node Delete Node object after drain (for hard-dead node replacement)
--out-dir <dir> Recovery artifact directory (default: ./artifacts/node-recovery)
-h, --help Show this help
USAGE
}
if ! command -v kubectl >/dev/null 2>&1; then
echo "kubectl is required" >&2
exit 1
fi
if ! command -v jq >/dev/null 2>&1; then
echo "jq is required" >&2
exit 1
fi
if [ "$#" -lt 1 ]; then
usage
exit 1
fi
node=""
assume_yes="false"
skip_drain="false"
delete_node="false"
out_dir="./artifacts/node-recovery"
while [ "$#" -gt 0 ]; do
case "$1" in
--yes)
assume_yes="true"
shift
;;
--skip-drain)
skip_drain="true"
shift
;;
--delete-node)
delete_node="true"
shift
;;
--out-dir)
out_dir="$2"
shift 2
;;
-h|--help)
usage
exit 0
;;
-*)
echo "Unknown option: $1" >&2
usage
exit 1
;;
*)
if [ -z "${node}" ]; then
node="$1"
else
echo "Unexpected argument: $1" >&2
usage
exit 1
fi
shift
;;
esac
done
if [ -z "${node}" ]; then
echo "Node name is required" >&2
usage
exit 1
fi
if ! kubectl get node "${node}" >/dev/null 2>&1; then
echo "Node ${node} not found in cluster API" >&2
exit 1
fi
if [ "${assume_yes}" != "true" ]; then
echo "About to prepare recovery workflow for node: ${node}"
echo "skip_drain=${skip_drain} delete_node=${delete_node}"
read -r -p "Type the node name to continue: " confirm
if [ "${confirm}" != "${node}" ]; then
echo "Confirmation did not match node name; aborting."
exit 1
fi
fi
timestamp="$(date +%Y%m%d-%H%M%S)"
artifacts_dir="${out_dir}/${node}-${timestamp}"
mkdir -p "${artifacts_dir}"
echo "Saving node and workload artifacts to ${artifacts_dir}"
kubectl get node "${node}" -o json > "${artifacts_dir}/node.json"
kubectl get node "${node}" --show-labels > "${artifacts_dir}/node.txt"
kubectl get pods -A --field-selector "spec.nodeName=${node}" -o wide > "${artifacts_dir}/pods-on-node.txt"
jq -r '
.metadata.labels
| to_entries[]
| select(
.key != "kubernetes.io/hostname"
and .key != "beta.kubernetes.io/hostname"
and .key != "node.kubernetes.io/instance-type"
and .key != "beta.kubernetes.io/instance-type"
and (.key | startswith("kubernetes.io/") | not)
and (.key | startswith("beta.kubernetes.io/") | not)
and (.key | startswith("node.kubernetes.io/") | not)
)
| "kubectl label node <replacement-node> " + .key + "=" + .value + " --overwrite"
' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-labels.sh"
jq -r '
(.spec.taints // [])[]
| "kubectl taint node <replacement-node> "
+ .key
+ (if .value then "=" + .value else "" end)
+ ":"
+ .effect
+ " --overwrite"
' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-taints.sh"
chmod +x "${artifacts_dir}/restore-labels.sh" "${artifacts_dir}/restore-taints.sh"
if [ "${skip_drain}" != "true" ]; then
echo "Cordoning ${node}"
kubectl cordon "${node}" || true
echo "Draining ${node}"
if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m; then
echo "Standard drain failed; retrying with --force"
if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force; then
echo "Force drain failed; retrying with --disable-eviction"
kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force --disable-eviction
fi
fi
fi
if [ "${delete_node}" = "true" ]; then
echo "Deleting node object ${node}"
kubectl delete node "${node}" || true
fi
cat <<NEXT
Recovery prep complete for ${node}.
Artifacts: ${artifacts_dir}
Next steps:
1) Reimage/reprovision replacement host.
2) Rejoin k3s and wait for node Ready.
3) Reapply labels: ${artifacts_dir}/restore-labels.sh
4) Reapply taints: ${artifacts_dir}/restore-taints.sh
5) Validate pods and uncordon replacement when ready.
NEXT

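The generated restore scripts intentionally leave a `<replacement-node>` placeholder so labels and taints get reviewed before they are reapplied. A minimal sketch of steps 3-4, assuming the replacement keeps the original node name (directory timestamp and node name illustrative):

```sh
# Fill in the placeholder, eyeball the commands, then apply.
dir=artifacts/node-recovery/titan-07-20260331-000644
sed -i 's/<replacement-node>/titan-07/' "${dir}/restore-labels.sh" "${dir}/restore-taints.sh"
cat "${dir}/restore-labels.sh" "${dir}/restore-taints.sh"   # review before running
"${dir}/restore-labels.sh" && "${dir}/restore-taints.sh"
```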

@@ -1,5 +1,19 @@
# Metis (node recovery)
## Fast path (SD/media failure)
1. Run `scripts/node_recover.sh <node> --yes --delete-node` from `titan-iac`.
2. Reimage/reprovision the replacement host.
3. Rejoin the replacement node to k3s.
4. Reapply labels and taints from generated artifacts:
- `artifacts/node-recovery/<node>-<timestamp>/restore-labels.sh`
- `artifacts/node-recovery/<node>-<timestamp>/restore-taints.sh`
5. Verify workloads, then uncordon the replacement node.
### Notes
- `node_recover.sh` snapshots node labels/taints and current pod placement before drain.
- Use `--skip-drain` for a dead/unreachable node where only artifact capture is possible (see the example below).
- Use `--delete-node` after drain (or for hard-dead nodes) so replacement join is clean.
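For an unreachable node where draining is impossible, a capture-only run is enough to preserve labels, taints, and pod placement (node name illustrative; flags are from `node_recover.sh` above):

```sh
# Artifacts only; no cordon/drain is attempted against the dead node.
scripts/node_recover.sh titan-07 --skip-drain --yes
```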
## Node classes (current map)
- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
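The class map above can be checked against the live cluster using the label keys these manifests rely on (`hardware`, `jetson`, and the worker role):

```sh
kubectl get nodes -L hardware,jetson,node-role.kubernetes.io/worker
```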


@@ -40,15 +40,25 @@ spec:
memory: "512Mi"
limits:
memory: "1Gi"
nodeSelector:
node-role.kubernetes.io/worker: "true"
hardware: rpi5
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: jetson
operator: In
values:
- "true"
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: jetson
operator: In
values:
- "true"


@@ -51,7 +51,7 @@ spec:
service: |
[SERVICE]
Flush 1
Log_Level info
Log_Level warn
Daemon Off
Parsers_File parsers.conf
Parsers_File custom_parsers.conf
@@ -74,7 +74,7 @@ spec:
Refresh_Interval 10
Rotate_Wait 30
Inotify_Watcher false
Read_from_Head On
Read_from_Head Off
DB /var/lib/fluent-bit/kube.db
storage.type filesystem
@@ -82,7 +82,7 @@ spec:
Name systemd
Tag journald.*
Path /var/log/journal
Read_From_Tail Off
Read_From_Tail On
DB /var/lib/fluent-bit/systemd.db
storage.type filesystem
filters: |
@@ -107,7 +107,7 @@ spec:
Logstash_Prefix kube
Replace_Dots On
Suppress_Type_Name On
Retry_Limit False
Retry_Limit 10
[OUTPUT]
Name es
@@ -119,4 +119,4 @@ spec:
Logstash_Prefix journald
Replace_Dots On
Suppress_Type_Name On
Retry_Limit False
Retry_Limit 10


@@ -24,7 +24,17 @@ spec:
operator: Exists
effect: NoSchedule
nodeSelector:
hardware: rpi5
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi4
- rpi5
containers:
- name: node-log-rotation
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131


@@ -37,15 +37,25 @@ spec:
limits:
cpu: "200m"
memory: "512Mi"
nodeSelector:
node-role.kubernetes.io/worker: "true"
hardware: rpi5
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: jetson
operator: In
values:
- "true"
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: jetson
operator: In
values:
- "true"


@@ -40,17 +40,27 @@ spec:
discovery.type: single-node
plugins.security.disabled: true
node.store.allow_mmap: false
nodeSelector:
node-role.kubernetes.io/worker: "true"
hardware: rpi5
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: jetson
operator: In
values:
- "true"
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: jetson
operator: In
values:
- "true"
sysctlInit:
enabled: true


@@ -76,15 +76,25 @@ spec:
memory: "256Mi"
limits:
memory: "512Mi"
nodeSelector:
node-role.kubernetes.io/worker: "true"
hardware: rpi5
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: jetson
operator: In
values:
- "true"
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: jetson
operator: In
values:
- "true"


@@ -12,39 +12,77 @@ k3s_agent_dropin="/host/etc/systemd/system/k3s-agent.service.d/99-logging.conf"
k3s_image_gc_dropin="/host/etc/systemd/system/k3s.service.d/98-image-gc.conf"
k3s_agent_image_gc_dropin="/host/etc/systemd/system/k3s-agent.service.d/98-image-gc.conf"
if [ ! -f "${journald_dropin}" ]; then
mkdir -p "$(dirname "${journald_dropin}")"
printf "[Journal]\nStorage=volatile\nRuntimeMaxUse=200M\nRuntimeKeepFree=512M\nMaxFileSec=1h\n" > "${journald_dropin}"
changed=1
journald_changed=1
fi
ensure_dropin() {
local path="$1"
local owner="$2"
local new_content="$3"
local current=""
if [ -f "${path}" ]; then
current="$(cat "${path}" || true)"
fi
if [ "${current}" != "${new_content}" ]; then
mkdir -p "$(dirname "${path}")"
printf "%s\n" "${new_content}" > "${path}"
changed=1
case "${owner}" in
journald)
journald_changed=1
;;
k3s)
k3s_changed=1
;;
k3s-agent)
k3s_agent_changed=1
;;
esac
fi
}
ensure_dropin \
"${journald_dropin}" \
"journald" \
"[Journal]
Storage=volatile
RuntimeMaxUse=200M
RuntimeKeepFree=512M
MaxFileSec=1h"
if [ -f "/host/etc/systemd/system/k3s.service" ]; then
ensure_dropin \
"${k3s_dropin}" \
"k3s" \
"[Service]
Environment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"
Environment=\"K3S_KUBELET_ARG=container-log-max-files=2\""
fi
if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_dropin}" ]; then
mkdir -p "$(dirname "${k3s_dropin}")"
printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_dropin}"
changed=1
k3s_changed=1
fi
if [ -f "/host/etc/systemd/system/k3s.service" ]; then
ensure_dropin \
"${k3s_image_gc_dropin}" \
"k3s" \
"[Service]
Environment=\"K3S_KUBELET_ARG=image-gc-high-threshold=65\"
Environment=\"K3S_KUBELET_ARG=image-gc-low-threshold=50\"
Environment=\"K3S_KUBELET_ARG=image-gc-minimum-available=8Gi\""
fi
if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_image_gc_dropin}" ]; then
mkdir -p "$(dirname "${k3s_image_gc_dropin}")"
printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_image_gc_dropin}"
changed=1
k3s_changed=1
fi
if [ -f "/host/etc/systemd/system/k3s-agent.service" ]; then
ensure_dropin \
"${k3s_agent_dropin}" \
"k3s-agent" \
"[Service]
Environment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"
Environment=\"K3S_KUBELET_ARG=container-log-max-files=2\""
fi
if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_dropin}" ]; then
mkdir -p "$(dirname "${k3s_agent_dropin}")"
printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_agent_dropin}"
changed=1
k3s_agent_changed=1
fi
if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_image_gc_dropin}" ]; then
mkdir -p "$(dirname "${k3s_agent_image_gc_dropin}")"
printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_agent_image_gc_dropin}"
changed=1
k3s_agent_changed=1
fi
if [ -f "/host/etc/systemd/system/k3s-agent.service" ]; then
ensure_dropin \
"${k3s_agent_image_gc_dropin}" \
"k3s-agent" \
"[Service]
Environment=\"K3S_KUBELET_ARG=image-gc-high-threshold=65\"
Environment=\"K3S_KUBELET_ARG=image-gc-low-threshold=50\"
Environment=\"K3S_KUBELET_ARG=image-gc-minimum-available=8Gi\""
fi
if [ "${changed}" -eq 1 ]; then

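The hunk ends at the `changed` gate, so the restart tail is not shown. Given the new per-owner flags, that gated section presumably reloads systemd and bounces only the affected units; a sketch under that assumption (unit names inferred from the drop-in paths, not part of this diff):

```sh
# Hypothetical continuation of the gated block; assumes the host rootfs
# (including /run) is mounted at /host so chroot'd systemctl can reach D-Bus.
chroot /host systemctl daemon-reload
if [ "${journald_changed:-0}" -eq 1 ]; then
  chroot /host systemctl restart systemd-journald
fi
if [ "${k3s_changed:-0}" -eq 1 ] && [ -f /host/etc/systemd/system/k3s.service ]; then
  chroot /host systemctl restart k3s
fi
if [ "${k3s_agent_changed:-0}" -eq 1 ] && [ -f /host/etc/systemd/system/k3s-agent.service ]; then
  chroot /host systemctl restart k3s-agent
fi
```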

@@ -18,6 +18,7 @@ spec:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
maintenance.bstein.dev/restart-rev: "20260207-2"
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "maintenance"
vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
@@ -105,7 +106,7 @@ spec:
node-role.kubernetes.io/worker: "true"
containers:
- name: ariadne
image: registry.bstein.dev/bstein/ariadne:0.1.0-0
image: registry.bstein.dev/bstein/ariadne:latest
imagePullPolicy: Always
command: ["/bin/sh", "-c"]
args:
@@ -285,7 +286,7 @@ spec:
- name: ARIADNE_SCHEDULE_MAILU_SYNC
value: "30 4 * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC
value: "0 5 * * *"
value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON
value: "*/5 * * * *"
- name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
@@ -293,23 +294,23 @@ spec:
- name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
value: "0 * * * *"
- name: ARIADNE_SCHEDULE_WGER_USER_SYNC
value: "0 5 * * *"
value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_WGER_ADMIN
value: "15 3 * * *"
- name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC
value: "0 6 * * *"
value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_FIREFLY_CRON
value: "0 3 * * *"
- name: ARIADNE_SCHEDULE_POD_CLEANER
value: "0 * * * *"
value: "*/30 * * * *"
- name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE
value: "23 3 * * *"
- name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
value: "30 4 * * 0"
value: "0 */4 * * *"
- name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
value: "0 * * * *"
value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_VAULT_OIDC
value: "0 * * * *"
value: "*/15 * * * *"
- name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
value: "*/5 * * * *"
- name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
@@ -319,9 +320,9 @@ spec:
- name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM
value: "*/10 * * * *"
- name: ARIADNE_SCHEDULE_CLUSTER_STATE
value: "*/15 * * * *"
value: "*/10 * * * *"
- name: ARIADNE_CLUSTER_STATE_KEEP
value: "168"
value: "720"
- name: WELCOME_EMAIL_ENABLED
value: "true"
- name: K8S_API_TIMEOUT_SEC
@@ -330,6 +331,8 @@ spec:
value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
- name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC
value: "5"
- name: ARIADNE_ALERTMANAGER_URL
value: http://alertmanager.monitoring.svc.cluster.local
- name: OPENSEARCH_URL
value: http://opensearch-master.logging.svc.cluster.local:9200
- name: OPENSEARCH_LIMIT_BYTES

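With the image now floating on `:latest` and `imagePullPolicy: Always`, the new `maintenance.bstein.dev/restart-rev` annotation is what forces a fresh pull: bumping it changes the pod template and triggers a rollout. An equivalent imperative nudge (deployment name and namespace assumed; the annotation key comes from this manifest):

```sh
# Hypothetical: bump the pod-template annotation to roll the deployment.
rev="$(date +%Y%m%d-%H%M)"
kubectl -n maintenance patch deployment ariadne --type merge \
  -p '{"spec":{"template":{"metadata":{"annotations":{"maintenance.bstein.dev/restart-rev":"'"${rev}"'"}}}}}'
```

`kubectl rollout restart deployment/ariadne` achieves the same effect via a stock annotation.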

@@ -33,17 +33,15 @@ spec:
command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
env:
- name: SWEEP_INTERVAL_SEC
value: "21600"
value: "7200"
- name: HIGH_USAGE_PERCENT
value: "70"
- name: EMERGENCY_USAGE_PERCENT
value: "80"
- name: BASE_THRESHOLD_DAYS
value: "14"
- name: HIGH_USAGE_THRESHOLD_DAYS
value: "3"
- name: LOG_RETENTION_DAYS
value: "7"
- name: ORPHAN_POD_RETENTION_DAYS
value: "3"
- name: JOURNAL_MAX_SIZE
value: "200M"
securityContext:


@@ -3,96 +3,71 @@ set -eu
ONE_SHOT=${ONE_SHOT:-false}
SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
ORPHAN_POD_RETENTION_DAYS=${ORPHAN_POD_RETENTION_DAYS:-3}
JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
cleanup_orphaned_hdd_pod_logs() {
if [ ! -d /host/var/log.hdd/pods ]; then
return 0
fi
ORPHAN_POD_RETENTION_DAYS="${ORPHAN_POD_RETENTION_DAYS}" python3 - <<'PY'
import os
import shutil
import time
hdd_pods = "/host/var/log.hdd/pods"
active_pods = "/host/var/log/pods"
retention_days = int(os.environ.get("ORPHAN_POD_RETENTION_DAYS", "3"))
cutoff = time.time() - (retention_days * 86400)
try:
active_names = set(os.listdir(active_pods))
except Exception:
active_names = set()
try:
hdd_names = os.listdir(hdd_pods)
except Exception:
hdd_names = []
for name in hdd_names:
path = os.path.join(hdd_pods, name)
if not os.path.isdir(path):
continue
if name in active_names:
continue
try:
mtime = os.path.getmtime(path)
except Exception:
continue
if mtime > cutoff:
continue
print(path)
shutil.rmtree(path, ignore_errors=True)
PY
}
sweep_once() {
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
threshold_days="${BASE_THRESHOLD_DAYS}"
# crictl image metadata frequently omits createdAt on this cluster; prune by
# runtime reachability whenever rootfs crosses pressure thresholds.
if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true"
fi
cleanup_orphaned_hdd_pod_logs
cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY'
import os
import time
days = int(os.environ.get("THRESHOLD_DAYS", "14"))
print(int(time.time()) - days * 86400)
PY
)
if [ -d /host/var/log.hdd/pods ]; then
find /host/var/log.hdd/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
fi
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
import json
import os
import sys
import time
try:
data = json.load(sys.stdin)
except Exception:
print("", end="")
sys.exit(0)
cutoff = int(os.environ.get("CUTOFF", "0"))
running = set(os.environ.get("RUNNING", "").split())
skip = os.environ.get("SKIP", "").split()
now = int(time.time())
prune = []
def is_skip(tags):
if not tags:
return False
for t in tags:
for prefix in skip:
if prefix and t.startswith(prefix):
return True
return False
for img in data.get("images", []):
image_id = img.get("id", "")
if not image_id:
continue
if image_id in running:
continue
tags = img.get("repoTags") or []
if is_skip(tags):
continue
created = img.get("createdAt") or 0
try:
created = int(str(created)) // 1000000000
except Exception:
created = 0
if created and created > now:
created = now
if cutoff and created and created < cutoff:
prune.append(image_id)
seen = set()
for p in prune:
if p in seen:
continue
seen.add(p)
print(p)
PY
)
if [ -n "${prune_list}" ]; then
printf "%s" "${prune_list}" | while read -r image_id; do
if [ -n "${image_id}" ]; then
chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
fi
done
fi
if [ -d /host/var/log.hdd/containers ]; then
find /host/var/log.hdd/containers -xtype l -print -delete 2>/dev/null || true
fi
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
@@ -100,9 +75,11 @@ PY
if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
# Emergency pass for rootfs pressure on SD-backed nodes.
chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true"
chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
find /host/var/log.hdd -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
fi
}

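Because the script honors a `ONE_SHOT` toggle, a single pass can be forced on a suspect node instead of waiting out `SWEEP_INTERVAL_SEC` (pod name illustrative; namespace assumed):

```sh
# Hypothetical one-off sweep from inside the daemonset pod on the affected node.
kubectl -n maintenance exec -it node-image-sweeper-abc12 -- \
  env ONE_SHOT=true /bin/sh /scripts/node_image_sweeper.sh
```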

@@ -303,8 +303,56 @@ data:
summary: "node-image-sweeper not fully ready"
labels:
severity: warning
- uid: logging-node-log-rotation-not-ready
title: "Node log rotation guardrails not ready"
condition: C
for: "10m"
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: atlas-vm
model:
intervalMs: 60000
maxDataPoints: 43200
expr: kube_daemonset_status_number_ready{namespace="logging",daemonset="node-log-rotation"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="logging",daemonset="node-log-rotation"}
legendFormat: '{{daemonset}}'
datasource:
type: prometheus
uid: atlas-vm
- refId: B
datasourceUid: __expr__
model:
expression: A
intervalMs: 60000
maxDataPoints: 43200
reducer: last
type: reduce
- refId: C
datasourceUid: __expr__
model:
expression: B
intervalMs: 60000
maxDataPoints: 43200
type: threshold
conditions:
- evaluator:
params: [1]
type: lt
operator:
type: and
reducer:
type: last
type: query
noDataState: NoData
execErrState: Error
annotations:
summary: "node-log-rotation is not fully ready"
labels:
severity: warning
- uid: maint-ariadne-image-sweeper-stale
title: "Ariadne image sweeper stale (schedule >8d)"
title: "Ariadne image sweeper stale (schedule >24h)"
condition: C
for: "5m"
data:
@@ -338,7 +386,7 @@ data:
type: threshold
conditions:
- evaluator:
params: [691200]
params: [86400]
type: gt
operator:
type: and
@@ -348,7 +396,7 @@ data:
noDataState: OK
execErrState: Error
annotations:
summary: "Ariadne image sweeper stale >8d since last success"
summary: "Ariadne image sweeper stale >24h since last success"
labels:
severity: warning
- uid: maint-cron-stale