maintenance: harden sd-write controls and recovery workflow
commit be92017f4d
parent 678d0efa2c
scripts/node_recover.sh | 163 (new executable file)
@@ -0,0 +1,163 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat <<USAGE
+Usage: scripts/node_recover.sh <node-name> [options]
+
+Options:
+  --yes           Skip confirmation prompt
+  --skip-drain    Do not cordon/drain; only capture recovery artifacts
+  --delete-node   Delete Node object after drain (for hard-dead node replacement)
+  --out-dir <dir> Recovery artifact directory (default: ./artifacts/node-recovery)
+  -h, --help      Show this help
+USAGE
+}
+
+if ! command -v kubectl >/dev/null 2>&1; then
+  echo "kubectl is required" >&2
+  exit 1
+fi
+if ! command -v jq >/dev/null 2>&1; then
+  echo "jq is required" >&2
+  exit 1
+fi
+
+if [ "$#" -lt 1 ]; then
+  usage
+  exit 1
+fi
+
+node=""
+assume_yes="false"
+skip_drain="false"
+delete_node="false"
+out_dir="./artifacts/node-recovery"
+
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --yes)
+      assume_yes="true"
+      shift
+      ;;
+    --skip-drain)
+      skip_drain="true"
+      shift
+      ;;
+    --delete-node)
+      delete_node="true"
+      shift
+      ;;
+    --out-dir)
+      out_dir="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    -*)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+    *)
+      if [ -z "${node}" ]; then
+        node="$1"
+      else
+        echo "Unexpected argument: $1" >&2
+        usage
+        exit 1
+      fi
+      shift
+      ;;
+  esac
+done
+
+if [ -z "${node}" ]; then
+  echo "Node name is required" >&2
+  usage
+  exit 1
+fi
+
+if ! kubectl get node "${node}" >/dev/null 2>&1; then
+  echo "Node ${node} not found in cluster API" >&2
+  exit 1
+fi
+
+if [ "${assume_yes}" != "true" ]; then
+  echo "About to prepare recovery workflow for node: ${node}"
+  echo "skip_drain=${skip_drain} delete_node=${delete_node}"
+  read -r -p "Type the node name to continue: " confirm
+  if [ "${confirm}" != "${node}" ]; then
+    echo "Confirmation did not match node name; aborting."
+    exit 1
+  fi
+fi
+
+timestamp="$(date +%Y%m%d-%H%M%S)"
+artifacts_dir="${out_dir}/${node}-${timestamp}"
+mkdir -p "${artifacts_dir}"
+
+echo "Saving node and workload artifacts to ${artifacts_dir}"
+kubectl get node "${node}" -o json > "${artifacts_dir}/node.json"
+kubectl get node "${node}" --show-labels > "${artifacts_dir}/node.txt"
+kubectl get pods -A --field-selector "spec.nodeName=${node}" -o wide > "${artifacts_dir}/pods-on-node.txt"
+
+jq -r '
+  .metadata.labels
+  | to_entries[]
+  | select(
+      .key != "kubernetes.io/hostname"
+      and .key != "beta.kubernetes.io/hostname"
+      and .key != "node.kubernetes.io/instance-type"
+      and .key != "beta.kubernetes.io/instance-type"
+      and (.key | startswith("kubernetes.io/") | not)
+      and (.key | startswith("beta.kubernetes.io/") | not)
+      and (.key | startswith("node.kubernetes.io/") | not)
+    )
+  | "kubectl label node <replacement-node> " + .key + "=" + .value + " --overwrite"
+' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-labels.sh"
+
+jq -r '
+  (.spec.taints // [])[]
+  | "kubectl taint node <replacement-node> "
+    + .key
+    + (if .value then "=" + .value else "" end)
+    + ":"
+    + .effect
+    + " --overwrite"
+' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-taints.sh"
+
+chmod +x "${artifacts_dir}/restore-labels.sh" "${artifacts_dir}/restore-taints.sh"
+
+if [ "${skip_drain}" != "true" ]; then
+  echo "Cordoning ${node}"
+  kubectl cordon "${node}" || true
+
+  echo "Draining ${node}"
+  if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m; then
+    echo "Standard drain failed; retrying with --force"
+    if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force; then
+      echo "Force drain failed; retrying with --disable-eviction"
+      kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force --disable-eviction
+    fi
+  fi
+fi
+
+if [ "${delete_node}" = "true" ]; then
+  echo "Deleting node object ${node}"
+  kubectl delete node "${node}" || true
+fi
+
+cat <<NEXT
+Recovery prep complete for ${node}.
+Artifacts: ${artifacts_dir}
+
+Next steps:
+  1) Reimage/reprovision replacement host.
+  2) Rejoin k3s and wait for node Ready.
+  3) Reapply labels: ${artifacts_dir}/restore-labels.sh
+  4) Reapply taints: ${artifacts_dir}/restore-taints.sh
+  5) Validate pods and uncordon replacement when ready.
+NEXT
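For reference, typical invocations look like the sketch below (the node name is an example taken from the class map later in this commit; the flags are the script's own):

```sh
# Hard-dead SD card: capture artifacts, drain, and remove the Node object.
scripts/node_recover.sh titan-07 --yes --delete-node

# Unreachable node where only artifact capture is possible: skip the drain.
scripts/node_recover.sh titan-07 --skip-drain --out-dir ./artifacts/node-recovery
```

Note that the escalation path ends at `--disable-eviction`, which deletes pods directly instead of going through the eviction API, so PodDisruptionBudgets no longer block the drain.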
@@ -1,5 +1,19 @@
 # Metis (node recovery)
 
+## Fast path (SD/media failure)
+1. Run `scripts/node_recover.sh <node> --yes --delete-node` from `titan-iac`.
+2. Reimage/reprovision the replacement host.
+3. Rejoin the replacement node to k3s.
+4. Reapply labels and taints from generated artifacts:
+   - `artifacts/node-recovery/<node>-<timestamp>/restore-labels.sh`
+   - `artifacts/node-recovery/<node>-<timestamp>/restore-taints.sh`
+5. Verify workloads, then uncordon the replacement node.
+
+### Notes
+- `node_recover.sh` snapshots node labels/taints and current pod placement before drain.
+- Use `--skip-drain` for a dead/unreachable node where only artifact capture is possible.
+- Use `--delete-node` after drain (or for hard-dead nodes) so replacement join is clean.
+
 ## Node classes (current map)
 - rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
 - rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
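The generated restore scripts deliberately target a `<replacement-node>` placeholder, so step 4 needs a substitution before replay. A minimal sketch (the timestamped directory name is hypothetical):

```sh
dir=artifacts/node-recovery/titan-07-20260207-103000   # example artifact dir
sed -i 's/<replacement-node>/titan-07/g' "${dir}/restore-labels.sh" "${dir}/restore-taints.sh"
bash "${dir}/restore-labels.sh"
bash "${dir}/restore-taints.sh"
kubectl uncordon titan-07
```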
@@ -40,15 +40,25 @@ spec:
               memory: "512Mi"
             limits:
               memory: "1Gi"
-      nodeSelector:
-        node-role.kubernetes.io/worker: "true"
-        hardware: rpi5
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
             nodeSelectorTerms:
+              - matchExpressions:
+                  - key: jetson
+                    operator: In
+                    values:
+                      - "true"
               - matchExpressions:
                   - key: hardware
                     operator: In
                     values:
                       - rpi5
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              preference:
+                matchExpressions:
+                  - key: jetson
+                    operator: In
+                    values:
+                      - "true"
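This hunk (and the matching ones further down) swaps a hard rpi5 pin for scheduling terms that admit jetson or rpi5 nodes while preferring jetson. To see which nodes now satisfy the required terms, one check under these assumptions (label keys taken from this diff) is:

```sh
# Extra columns show who matches: jetson=true OR hardware=rpi5, preferring jetson.
kubectl get nodes -L hardware,jetson,node-role.kubernetes.io/worker
```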
@@ -51,7 +51,7 @@ spec:
   service: |
     [SERVICE]
         Flush 1
-        Log_Level info
+        Log_Level warn
         Daemon Off
         Parsers_File parsers.conf
         Parsers_File custom_parsers.conf
@@ -74,7 +74,7 @@
         Refresh_Interval 10
         Rotate_Wait 30
         Inotify_Watcher false
-        Read_from_Head On
+        Read_from_Head Off
         DB /var/lib/fluent-bit/kube.db
         storage.type filesystem
 
@@ -82,7 +82,7 @@
        Name systemd
        Tag journald.*
        Path /var/log/journal
-       Read_From_Tail Off
+       Read_From_Tail On
        DB /var/lib/fluent-bit/systemd.db
        storage.type filesystem
  filters: |
@@ -107,7 +107,7 @@
        Logstash_Prefix kube
        Replace_Dots On
        Suppress_Type_Name On
-       Retry_Limit False
+       Retry_Limit 10
 
    [OUTPUT]
        Name es
@@ -119,4 +119,4 @@
        Logstash_Prefix journald
        Replace_Dots On
        Suppress_Type_Name On
-       Retry_Limit False
+       Retry_Limit 10
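Taken together, these fluent-bit changes cap write amplification: with `Read_from_Head Off` plus a filesystem-backed `DB`, restarts resume from a checkpoint instead of re-ingesting whole files, and a bounded `Retry_Limit` stops an unreachable Elasticsearch output from buffering forever. A node-side spot check, assuming the DB paths from this config, is:

```sh
# Tail/systemd offsets should persist here across fluent-bit pod restarts.
ls -lh /var/lib/fluent-bit/kube.db /var/lib/fluent-bit/systemd.db
```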
@@ -24,7 +24,17 @@ spec:
           operator: Exists
           effect: NoSchedule
       nodeSelector:
-        hardware: rpi5
+        node-role.kubernetes.io/worker: "true"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: hardware
+                    operator: In
+                    values:
+                      - rpi4
+                      - rpi5
       containers:
         - name: node-log-rotation
           image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
@@ -37,15 +37,25 @@ spec:
            limits:
              cpu: "200m"
              memory: "512Mi"
-      nodeSelector:
-        node-role.kubernetes.io/worker: "true"
-        hardware: rpi5
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
             nodeSelectorTerms:
+              - matchExpressions:
+                  - key: jetson
+                    operator: In
+                    values:
+                      - "true"
               - matchExpressions:
                   - key: hardware
                     operator: In
                     values:
                       - rpi5
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              preference:
+                matchExpressions:
+                  - key: jetson
+                    operator: In
+                    values:
+                      - "true"
@@ -40,17 +40,27 @@ spec:
     discovery.type: single-node
     plugins.security.disabled: true
     node.store.allow_mmap: false
-  nodeSelector:
-    node-role.kubernetes.io/worker: "true"
-    hardware: rpi5
   affinity:
     nodeAffinity:
       requiredDuringSchedulingIgnoredDuringExecution:
         nodeSelectorTerms:
+          - matchExpressions:
+              - key: jetson
+                operator: In
+                values:
+                  - "true"
           - matchExpressions:
               - key: hardware
                 operator: In
                 values:
                   - rpi5
+      preferredDuringSchedulingIgnoredDuringExecution:
+        - weight: 100
+          preference:
+            matchExpressions:
+              - key: jetson
+                operator: In
+                values:
+                  - "true"
   sysctlInit:
     enabled: true
@@ -76,15 +76,25 @@ spec:
              memory: "256Mi"
            limits:
              memory: "512Mi"
-      nodeSelector:
-        node-role.kubernetes.io/worker: "true"
-        hardware: rpi5
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
             nodeSelectorTerms:
+              - matchExpressions:
+                  - key: jetson
+                    operator: In
+                    values:
+                      - "true"
               - matchExpressions:
                   - key: hardware
                     operator: In
                     values:
                       - rpi5
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              preference:
+                matchExpressions:
+                  - key: jetson
+                    operator: In
+                    values:
+                      - "true"
@@ -12,39 +12,77 @@ k3s_agent_dropin="/host/etc/systemd/system/k3s-agent.service.d/99-logging.conf"
 k3s_image_gc_dropin="/host/etc/systemd/system/k3s.service.d/98-image-gc.conf"
 k3s_agent_image_gc_dropin="/host/etc/systemd/system/k3s-agent.service.d/98-image-gc.conf"
 
-if [ ! -f "${journald_dropin}" ]; then
-  mkdir -p "$(dirname "${journald_dropin}")"
-  printf "[Journal]\nStorage=volatile\nRuntimeMaxUse=200M\nRuntimeKeepFree=512M\nMaxFileSec=1h\n" > "${journald_dropin}"
-  changed=1
-  journald_changed=1
-fi
-
-if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_dropin}" ]; then
-  mkdir -p "$(dirname "${k3s_dropin}")"
-  printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_dropin}"
-  changed=1
-  k3s_changed=1
-fi
-
-if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_image_gc_dropin}" ]; then
-  mkdir -p "$(dirname "${k3s_image_gc_dropin}")"
-  printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_image_gc_dropin}"
-  changed=1
-  k3s_changed=1
-fi
-
-if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_dropin}" ]; then
-  mkdir -p "$(dirname "${k3s_agent_dropin}")"
-  printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_agent_dropin}"
-  changed=1
-  k3s_agent_changed=1
-fi
-
-if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_image_gc_dropin}" ]; then
-  mkdir -p "$(dirname "${k3s_agent_image_gc_dropin}")"
-  printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_agent_image_gc_dropin}"
-  changed=1
-  k3s_agent_changed=1
-fi
-
+ensure_dropin() {
+  local path="$1"
+  local owner="$2"
+  local new_content="$3"
+  local current=""
+  if [ -f "${path}" ]; then
+    current="$(cat "${path}" || true)"
+  fi
+  if [ "${current}" != "${new_content}" ]; then
+    mkdir -p "$(dirname "${path}")"
+    printf "%s\n" "${new_content}" > "${path}"
+    changed=1
+    case "${owner}" in
+      journald)
+        journald_changed=1
+        ;;
+      k3s)
+        k3s_changed=1
+        ;;
+      k3s-agent)
+        k3s_agent_changed=1
+        ;;
+    esac
+  fi
+}
+
+ensure_dropin \
+  "${journald_dropin}" \
+  "journald" \
+  "[Journal]
+Storage=volatile
+RuntimeMaxUse=200M
+RuntimeKeepFree=512M
+MaxFileSec=1h"
+
+if [ -f "/host/etc/systemd/system/k3s.service" ]; then
+  ensure_dropin \
+    "${k3s_dropin}" \
+    "k3s" \
+    "[Service]
+Environment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"
+Environment=\"K3S_KUBELET_ARG=container-log-max-files=2\""
+fi
+
+if [ -f "/host/etc/systemd/system/k3s.service" ]; then
+  ensure_dropin \
+    "${k3s_image_gc_dropin}" \
+    "k3s" \
+    "[Service]
+Environment=\"K3S_KUBELET_ARG=image-gc-high-threshold=65\"
+Environment=\"K3S_KUBELET_ARG=image-gc-low-threshold=50\"
+Environment=\"K3S_KUBELET_ARG=image-gc-minimum-available=8Gi\""
+fi
+
+if [ -f "/host/etc/systemd/system/k3s-agent.service" ]; then
+  ensure_dropin \
+    "${k3s_agent_dropin}" \
+    "k3s-agent" \
+    "[Service]
+Environment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"
+Environment=\"K3S_KUBELET_ARG=container-log-max-files=2\""
+fi
+
+if [ -f "/host/etc/systemd/system/k3s-agent.service" ]; then
+  ensure_dropin \
+    "${k3s_agent_image_gc_dropin}" \
+    "k3s-agent" \
+    "[Service]
+Environment=\"K3S_KUBELET_ARG=image-gc-high-threshold=65\"
+Environment=\"K3S_KUBELET_ARG=image-gc-low-threshold=50\"
+Environment=\"K3S_KUBELET_ARG=image-gc-minimum-available=8Gi\""
+fi
+
 if [ "${changed}" -eq 1 ]; then
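`ensure_dropin` makes the drop-ins convergent: it rewrites a file only when its content drifts, so repeated runs no longer depend on "file missing" as the trigger and threshold changes (70/60/5Gi to 65/50/8Gi here) actually propagate. A spot check on an agent node, assuming the unit and paths from this script, might be:

```sh
# On the host: confirm the drop-in landed with the new thresholds...
cat /etc/systemd/system/k3s-agent.service.d/98-image-gc.conf
# ...and that the unit will see the kubelet-arg environment entries.
systemctl show k3s-agent -p Environment
```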
@@ -18,6 +18,7 @@ spec:
         prometheus.io/scrape: "true"
         prometheus.io/port: "8080"
         prometheus.io/path: "/metrics"
+        maintenance.bstein.dev/restart-rev: "20260207-2"
         vault.hashicorp.com/agent-inject: "true"
         vault.hashicorp.com/role: "maintenance"
         vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
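Bumping a pod-template annotation such as `restart-rev` is a standard way to force a rolling restart without touching the image. An imperative equivalent, as a sketch only (deployment name and namespace are assumptions, not confirmed by this diff), would be:

```sh
# Hypothetical names: adjust to the actual ariadne Deployment and namespace.
kubectl -n maintenance patch deployment ariadne --type merge -p \
  '{"spec":{"template":{"metadata":{"annotations":{"maintenance.bstein.dev/restart-rev":"20260207-2"}}}}}'
```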
@@ -105,7 +106,7 @@ spec:
         node-role.kubernetes.io/worker: "true"
       containers:
         - name: ariadne
-          image: registry.bstein.dev/bstein/ariadne:0.1.0-0
+          image: registry.bstein.dev/bstein/ariadne:latest
           imagePullPolicy: Always
           command: ["/bin/sh", "-c"]
           args:
@@ -285,7 +286,7 @@ spec:
             - name: ARIADNE_SCHEDULE_MAILU_SYNC
               value: "30 4 * * *"
             - name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC
-              value: "0 5 * * *"
+              value: "*/15 * * * *"
             - name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON
               value: "*/5 * * * *"
             - name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
@@ -293,23 +294,23 @@ spec:
             - name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
               value: "0 * * * *"
             - name: ARIADNE_SCHEDULE_WGER_USER_SYNC
-              value: "0 5 * * *"
+              value: "*/15 * * * *"
             - name: ARIADNE_SCHEDULE_WGER_ADMIN
               value: "15 3 * * *"
             - name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC
-              value: "0 6 * * *"
+              value: "*/15 * * * *"
             - name: ARIADNE_SCHEDULE_FIREFLY_CRON
               value: "0 3 * * *"
             - name: ARIADNE_SCHEDULE_POD_CLEANER
-              value: "0 * * * *"
+              value: "*/30 * * * *"
             - name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE
               value: "23 3 * * *"
             - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
-              value: "30 4 * * 0"
+              value: "0 */4 * * *"
             - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
-              value: "0 * * * *"
+              value: "*/15 * * * *"
             - name: ARIADNE_SCHEDULE_VAULT_OIDC
-              value: "0 * * * *"
+              value: "*/15 * * * *"
             - name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
               value: "*/5 * * * *"
             - name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
@@ -319,9 +320,9 @@ spec:
             - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM
               value: "*/10 * * * *"
             - name: ARIADNE_SCHEDULE_CLUSTER_STATE
-              value: "*/15 * * * *"
+              value: "*/10 * * * *"
             - name: ARIADNE_CLUSTER_STATE_KEEP
-              value: "168"
+              value: "720"
             - name: WELCOME_EMAIL_ENABLED
               value: "true"
             - name: K8S_API_TIMEOUT_SEC
@@ -330,6 +331,8 @@ spec:
               value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
             - name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC
               value: "5"
+            - name: ARIADNE_ALERTMANAGER_URL
+              value: http://alertmanager.monitoring.svc.cluster.local
             - name: OPENSEARCH_URL
               value: http://opensearch-master.logging.svc.cluster.local:9200
             - name: OPENSEARCH_LIMIT_BYTES
@@ -33,17 +33,15 @@ spec:
           command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
           env:
             - name: SWEEP_INTERVAL_SEC
-              value: "21600"
+              value: "7200"
             - name: HIGH_USAGE_PERCENT
               value: "70"
             - name: EMERGENCY_USAGE_PERCENT
               value: "80"
             - name: BASE_THRESHOLD_DAYS
               value: "14"
             - name: HIGH_USAGE_THRESHOLD_DAYS
               value: "3"
-            - name: LOG_RETENTION_DAYS
-              value: "7"
+            - name: ORPHAN_POD_RETENTION_DAYS
+              value: "3"
-            - name: JOURNAL_MAX_SIZE
-              value: "200M"
           securityContext:
 
@@ -3,96 +3,71 @@ set -eu
 ONE_SHOT=${ONE_SHOT:-false}
 SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
 BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
 HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
 HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
 EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
 LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
+ORPHAN_POD_RETENTION_DAYS=${ORPHAN_POD_RETENTION_DAYS:-3}
 JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
 SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
 
+cleanup_orphaned_hdd_pod_logs() {
+  if [ ! -d /host/var/log.hdd/pods ]; then
+    return 0
+  fi
+
+  ORPHAN_POD_RETENTION_DAYS="${ORPHAN_POD_RETENTION_DAYS}" python3 - <<'PY'
+import os
+import shutil
+import time
+
+hdd_pods = "/host/var/log.hdd/pods"
+active_pods = "/host/var/log/pods"
+retention_days = int(os.environ.get("ORPHAN_POD_RETENTION_DAYS", "3"))
+cutoff = time.time() - (retention_days * 86400)
+
+try:
+    active_names = set(os.listdir(active_pods))
+except Exception:
+    active_names = set()
+
+try:
+    hdd_names = os.listdir(hdd_pods)
+except Exception:
+    hdd_names = []
+
+for name in hdd_names:
+    path = os.path.join(hdd_pods, name)
+    if not os.path.isdir(path):
+        continue
+    if name in active_names:
+        continue
+    try:
+        mtime = os.path.getmtime(path)
+    except Exception:
+        continue
+    if mtime > cutoff:
+        continue
+    print(path)
+    shutil.rmtree(path, ignore_errors=True)
+PY
+}
+
 sweep_once() {
   usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
   threshold_days="${BASE_THRESHOLD_DAYS}"
 
+  # crictl image metadata frequently omits createdAt on this cluster; prune by
+  # runtime reachability whenever rootfs crosses pressure thresholds.
   if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
     threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
+    chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true"
   fi
 
-  cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY'
-import os
-import time
-
-days = int(os.environ.get("THRESHOLD_DAYS", "14"))
-print(int(time.time()) - days * 86400)
-PY
-)
-
-  RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
-  IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
-
-  prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
-import json
-import os
-import sys
-import time
-
-try:
-    data = json.load(sys.stdin)
-except Exception:
-    print("", end="")
-    sys.exit(0)
-
-cutoff = int(os.environ.get("CUTOFF", "0"))
-running = set(os.environ.get("RUNNING", "").split())
-skip = os.environ.get("SKIP", "").split()
-now = int(time.time())
-prune = []
-
-
-def is_skip(tags):
-    if not tags:
-        return False
-    for t in tags:
-        for prefix in skip:
-            if prefix and t.startswith(prefix):
-                return True
-    return False
-
-
-for img in data.get("images", []):
-    image_id = img.get("id", "")
-    if not image_id:
-        continue
-    if image_id in running:
-        continue
-    tags = img.get("repoTags") or []
-    if is_skip(tags):
-        continue
-    created = img.get("createdAt") or 0
-    try:
-        created = int(str(created)) // 1000000000
-    except Exception:
-        created = 0
-    if created and created > now:
-        created = now
-    if cutoff and created and created < cutoff:
-        prune.append(image_id)
-
-seen = set()
-for p in prune:
-    if p in seen:
-        continue
-    seen.add(p)
-    print(p)
-PY
-)
-
-  if [ -n "${prune_list}" ]; then
-    printf "%s" "${prune_list}" | while read -r image_id; do
-      if [ -n "${image_id}" ]; then
-        chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
-      fi
-    done
-  fi
+  cleanup_orphaned_hdd_pod_logs
+
+  if [ -d /host/var/log.hdd/pods ]; then
+    find /host/var/log.hdd/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
+  fi
+
+  if [ -d /host/var/log.hdd/containers ]; then
+    find /host/var/log.hdd/containers -xtype l -print -delete 2>/dev/null || true
+  fi
 
   find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
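The rewrite drops the age-based prune list entirely and leans on `crictl rmi --prune`, which removes every image not referenced by a container, so stale `createdAt` metadata can no longer stall cleanup. The manual equivalent of the new pressure-triggered pass, run directly on a node, is roughly:

```sh
# Rootfs usage percentage, the same figure the script compares against.
df -P / | awk 'NR==2 {gsub(/%/,"",$5); print $5}'
# Drop all images that no container currently references.
crictl rmi --prune
```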
@@ -100,9 +75,11 @@ PY
 
   if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
+    # Emergency pass for rootfs pressure on SD-backed nodes.
+    chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true"
     chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
     find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
     find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
     find /host/var/log.hdd -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
     chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
   fi
 }
@@ -303,8 +303,56 @@
           summary: "node-image-sweeper not fully ready"
         labels:
           severity: warning
+      - uid: logging-node-log-rotation-not-ready
+        title: "Node log rotation guardrails not ready"
+        condition: C
+        for: "10m"
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: atlas-vm
+            model:
+              intervalMs: 60000
+              maxDataPoints: 43200
+              expr: kube_daemonset_status_number_ready{namespace="logging",daemonset="node-log-rotation"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="logging",daemonset="node-log-rotation"}
+              legendFormat: '{{daemonset}}'
+              datasource:
+                type: prometheus
+                uid: atlas-vm
+          - refId: B
+            datasourceUid: __expr__
+            model:
+              expression: A
+              intervalMs: 60000
+              maxDataPoints: 43200
+              reducer: last
+              type: reduce
+          - refId: C
+            datasourceUid: __expr__
+            model:
+              expression: B
+              intervalMs: 60000
+              maxDataPoints: 43200
+              type: threshold
+              conditions:
+                - evaluator:
+                    params: [1]
+                    type: lt
+                  operator:
+                    type: and
+                  reducer:
+                    type: last
+                  type: query
+        noDataState: NoData
+        execErrState: Error
+        annotations:
+          summary: "node-log-rotation is not fully ready"
+        labels:
+          severity: warning
       - uid: maint-ariadne-image-sweeper-stale
-        title: "Ariadne image sweeper stale (schedule >8d)"
+        title: "Ariadne image sweeper stale (schedule >24h)"
         condition: C
         for: "5m"
         data:
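The new alert fires when the ready/desired ratio of the node-log-rotation DaemonSet stays below 1 for ten minutes. To hand-check the expression against the same datasource, one sketch (reusing the VictoriaMetrics service URL that appears elsewhere in this commit, queried via the Prometheus-compatible API) is:

```sh
curl -s 'http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428/api/v1/query' \
  --data-urlencode 'query=kube_daemonset_status_number_ready{namespace="logging",daemonset="node-log-rotation"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="logging",daemonset="node-log-rotation"}'
```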
@@ -338,7 +386,7 @@
               type: threshold
               conditions:
                 - evaluator:
-                    params: [691200]
+                    params: [86400]
                     type: gt
                   operator:
                     type: and
@@ -348,7 +396,7 @@
         noDataState: OK
         execErrState: Error
         annotations:
-          summary: "Ariadne image sweeper stale >8d since last success"
+          summary: "Ariadne image sweeper stale >24h since last success"
         labels:
           severity: warning
       - uid: maint-cron-stale