maintenance: harden sd-write controls and recovery workflow
This commit is contained in:
parent
7a70c22a46
commit
03ae79df3e
163
scripts/node_recover.sh
Executable file
163
scripts/node_recover.sh
Executable file
@ -0,0 +1,163 @@
|
|||||||
|
#!/usr/bin/env bash
# node_recover.sh — capture recovery artifacts for a Kubernetes node, then
# optionally cordon/drain it and delete the Node object so a replacement
# host can rejoin cleanly.
#
# Artifacts written per run (under --out-dir/<node>-<timestamp>/):
#   node.json / node.txt   node object snapshot (JSON + labels view)
#   pods-on-node.txt       pod placement at time of capture
#   restore-labels.sh      kubectl commands to reapply custom labels
#   restore-taints.sh      kubectl commands to reapply taints
set -euo pipefail

usage() {
  cat <<USAGE
Usage: scripts/node_recover.sh <node-name> [options]

Options:
  --yes              Skip confirmation prompt
  --skip-drain       Do not cordon/drain; only capture recovery artifacts
  --delete-node      Delete Node object after drain (for hard-dead node replacement)
  --out-dir <dir>    Recovery artifact directory (default: ./artifacts/node-recovery)
  -h, --help         Show this help
USAGE
}

# Hard prerequisites: everything below shells out to kubectl and jq.
for tool in kubectl jq; do
  if ! command -v "${tool}" >/dev/null 2>&1; then
    echo "${tool} is required" >&2
    exit 1
  fi
done

if [ "$#" -lt 1 ]; then
  usage
  exit 1
fi

node=""
assume_yes="false"
skip_drain="false"
delete_node="false"
out_dir="./artifacts/node-recovery"

while [ "$#" -gt 0 ]; do
  case "$1" in
    --yes)
      assume_yes="true"
      shift
      ;;
    --skip-drain)
      skip_drain="true"
      shift
      ;;
    --delete-node)
      delete_node="true"
      shift
      ;;
    --out-dir)
      # Fix: under `set -u`, a bare "$2" aborts with an opaque
      # "unbound variable" error when the value is missing.
      if [ "$#" -lt 2 ]; then
        echo "--out-dir requires a directory argument" >&2
        usage
        exit 1
      fi
      out_dir="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    -*)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
    *)
      if [ -z "${node}" ]; then
        node="$1"
      else
        echo "Unexpected argument: $1" >&2
        usage
        exit 1
      fi
      shift
      ;;
  esac
done

if [ -z "${node}" ]; then
  echo "Node name is required" >&2
  usage
  exit 1
fi

if ! kubectl get node "${node}" >/dev/null 2>&1; then
  echo "Node ${node} not found in cluster API" >&2
  exit 1
fi

# Confirmation gate: require retyping the node name unless --yes was given.
if [ "${assume_yes}" != "true" ]; then
  echo "About to prepare recovery workflow for node: ${node}"
  echo "skip_drain=${skip_drain} delete_node=${delete_node}"
  read -r -p "Type the node name to continue: " confirm
  if [ "${confirm}" != "${node}" ]; then
    echo "Confirmation did not match node name; aborting."
    exit 1
  fi
fi

timestamp="$(date +%Y%m%d-%H%M%S)"
artifacts_dir="${out_dir}/${node}-${timestamp}"
mkdir -p "${artifacts_dir}"

echo "Saving node and workload artifacts to ${artifacts_dir}"
kubectl get node "${node}" -o json > "${artifacts_dir}/node.json"
kubectl get node "${node}" --show-labels > "${artifacts_dir}/node.txt"
kubectl get pods -A --field-selector "spec.nodeName=${node}" -o wide > "${artifacts_dir}/pods-on-node.txt"

# Emit kubectl commands that reapply the node's *custom* labels on a
# replacement. Kubelet-managed labels (hostname, instance-type, and the
# kubernetes.io/, beta.kubernetes.io/, node.kubernetes.io/ prefixes) are
# skipped because the replacement node sets them itself. node-role.* labels
# are intentionally kept — they do not match those prefixes.
jq -r '
  .metadata.labels
  | to_entries[]
  | select(
      .key != "kubernetes.io/hostname"
      and .key != "beta.kubernetes.io/hostname"
      and .key != "node.kubernetes.io/instance-type"
      and .key != "beta.kubernetes.io/instance-type"
      and (.key | startswith("kubernetes.io/") | not)
      and (.key | startswith("beta.kubernetes.io/") | not)
      and (.key | startswith("node.kubernetes.io/") | not)
    )
  | "kubectl label node <replacement-node> " + .key + "=" + .value + " --overwrite"
' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-labels.sh"

# Same idea for taints; a taint value is optional, the effect is required.
jq -r '
  (.spec.taints // [])[]
  | "kubectl taint node <replacement-node> "
    + .key
    + (if .value then "=" + .value else "" end)
    + ":"
    + .effect
    + " --overwrite"
' "${artifacts_dir}/node.json" > "${artifacts_dir}/restore-taints.sh"

chmod +x "${artifacts_dir}/restore-labels.sh" "${artifacts_dir}/restore-taints.sh"

if [ "${skip_drain}" != "true" ]; then
  echo "Cordoning ${node}"
  # Best-effort: a hard-dead node may reject the patch; drain is still tried.
  kubectl cordon "${node}" || true

  echo "Draining ${node}"
  # Escalating drain: standard eviction -> --force (unmanaged pods) ->
  # --disable-eviction (bypasses PodDisruptionBudgets, last resort).
  if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m; then
    echo "Standard drain failed; retrying with --force"
    if ! kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force; then
      echo "Force drain failed; retrying with --disable-eviction"
      kubectl drain "${node}" --ignore-daemonsets --delete-emptydir-data --grace-period=30 --timeout=20m --force --disable-eviction
    fi
  fi
fi

if [ "${delete_node}" = "true" ]; then
  echo "Deleting node object ${node}"
  kubectl delete node "${node}" || true
fi

cat <<NEXT
Recovery prep complete for ${node}.
Artifacts: ${artifacts_dir}

Next steps:
1) Reimage/reprovision replacement host.
2) Rejoin k3s and wait for node Ready.
3) Reapply labels: ${artifacts_dir}/restore-labels.sh
4) Reapply taints: ${artifacts_dir}/restore-taints.sh
5) Validate pods and uncordon replacement when ready.
NEXT
|
||||||
@ -1,5 +1,19 @@
|
|||||||
# Metis (node recovery)
|
# Metis (node recovery)
|
||||||
|
|
||||||
|
## Fast path (SD/media failure)
|
||||||
|
1. Run `scripts/node_recover.sh <node> --yes --delete-node` from `titan-iac`.
|
||||||
|
2. Reimage/reprovision the replacement host.
|
||||||
|
3. Rejoin the replacement node to k3s.
|
||||||
|
4. Reapply labels and taints from generated artifacts:
|
||||||
|
- `artifacts/node-recovery/<node>-<timestamp>/restore-labels.sh`
|
||||||
|
- `artifacts/node-recovery/<node>-<timestamp>/restore-taints.sh`
|
||||||
|
5. Verify workloads, then uncordon the replacement node.
|
||||||
|
|
||||||
|
### Notes
|
||||||
|
- `node_recover.sh` snapshots node labels/taints and current pod placement before drain.
|
||||||
|
- Use `--skip-drain` for a dead/unreachable node where only artifact capture is possible.
|
||||||
|
- Use `--delete-node` after drain (or for hard-dead nodes) so replacement join is clean.
|
||||||
|
|
||||||
## Node classes (current map)
|
## Node classes (current map)
|
||||||
- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
|
- rpi5 Ubuntu workers: titan-04,05,06,07,08,09,10,11,20,21 (Ubuntu 24.04.3, k3s agent)
|
||||||
- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
|
- rpi5 control-plane: titan-0a/0b/0c (Ubuntu 24.04.1, k3s server, control-plane taint)
|
||||||
|
|||||||
@ -40,15 +40,25 @@ spec:
|
|||||||
memory: "512Mi"
|
memory: "512Mi"
|
||||||
limits:
|
limits:
|
||||||
memory: "1Gi"
|
memory: "1Gi"
|
||||||
nodeSelector:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
hardware: rpi5
|
|
||||||
affinity:
|
affinity:
|
||||||
nodeAffinity:
|
nodeAffinity:
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
nodeSelectorTerms:
|
nodeSelectorTerms:
|
||||||
|
- matchExpressions:
|
||||||
|
- key: jetson
|
||||||
|
operator: In
|
||||||
|
values:
|
||||||
|
- "true"
|
||||||
- matchExpressions:
|
- matchExpressions:
|
||||||
- key: hardware
|
- key: hardware
|
||||||
operator: In
|
operator: In
|
||||||
values:
|
values:
|
||||||
- rpi5
|
- rpi5
|
||||||
|
preferredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
- weight: 100
|
||||||
|
preference:
|
||||||
|
matchExpressions:
|
||||||
|
- key: jetson
|
||||||
|
operator: In
|
||||||
|
values:
|
||||||
|
- "true"
|
||||||
|
|||||||
@ -51,7 +51,7 @@ spec:
|
|||||||
service: |
|
service: |
|
||||||
[SERVICE]
|
[SERVICE]
|
||||||
Flush 1
|
Flush 1
|
||||||
Log_Level info
|
Log_Level warn
|
||||||
Daemon Off
|
Daemon Off
|
||||||
Parsers_File parsers.conf
|
Parsers_File parsers.conf
|
||||||
Parsers_File custom_parsers.conf
|
Parsers_File custom_parsers.conf
|
||||||
@ -74,7 +74,7 @@ spec:
|
|||||||
Refresh_Interval 10
|
Refresh_Interval 10
|
||||||
Rotate_Wait 30
|
Rotate_Wait 30
|
||||||
Inotify_Watcher false
|
Inotify_Watcher false
|
||||||
Read_from_Head On
|
Read_from_Head Off
|
||||||
DB /var/lib/fluent-bit/kube.db
|
DB /var/lib/fluent-bit/kube.db
|
||||||
storage.type filesystem
|
storage.type filesystem
|
||||||
|
|
||||||
@ -82,7 +82,7 @@ spec:
|
|||||||
Name systemd
|
Name systemd
|
||||||
Tag journald.*
|
Tag journald.*
|
||||||
Path /var/log/journal
|
Path /var/log/journal
|
||||||
Read_From_Tail Off
|
Read_From_Tail On
|
||||||
DB /var/lib/fluent-bit/systemd.db
|
DB /var/lib/fluent-bit/systemd.db
|
||||||
storage.type filesystem
|
storage.type filesystem
|
||||||
filters: |
|
filters: |
|
||||||
@ -107,7 +107,7 @@ spec:
|
|||||||
Logstash_Prefix kube
|
Logstash_Prefix kube
|
||||||
Replace_Dots On
|
Replace_Dots On
|
||||||
Suppress_Type_Name On
|
Suppress_Type_Name On
|
||||||
Retry_Limit False
|
Retry_Limit 10
|
||||||
|
|
||||||
[OUTPUT]
|
[OUTPUT]
|
||||||
Name es
|
Name es
|
||||||
@ -119,4 +119,4 @@ spec:
|
|||||||
Logstash_Prefix journald
|
Logstash_Prefix journald
|
||||||
Replace_Dots On
|
Replace_Dots On
|
||||||
Suppress_Type_Name On
|
Suppress_Type_Name On
|
||||||
Retry_Limit False
|
Retry_Limit 10
|
||||||
|
|||||||
@ -24,7 +24,17 @@ spec:
|
|||||||
operator: Exists
|
operator: Exists
|
||||||
effect: NoSchedule
|
effect: NoSchedule
|
||||||
nodeSelector:
|
nodeSelector:
|
||||||
hardware: rpi5
|
node-role.kubernetes.io/worker: "true"
|
||||||
|
affinity:
|
||||||
|
nodeAffinity:
|
||||||
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
nodeSelectorTerms:
|
||||||
|
- matchExpressions:
|
||||||
|
- key: hardware
|
||||||
|
operator: In
|
||||||
|
values:
|
||||||
|
- rpi4
|
||||||
|
- rpi5
|
||||||
containers:
|
containers:
|
||||||
- name: node-log-rotation
|
- name: node-log-rotation
|
||||||
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
|
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
|
||||||
|
|||||||
@ -37,15 +37,25 @@ spec:
|
|||||||
limits:
|
limits:
|
||||||
cpu: "200m"
|
cpu: "200m"
|
||||||
memory: "512Mi"
|
memory: "512Mi"
|
||||||
nodeSelector:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
hardware: rpi5
|
|
||||||
affinity:
|
affinity:
|
||||||
nodeAffinity:
|
nodeAffinity:
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
nodeSelectorTerms:
|
nodeSelectorTerms:
|
||||||
|
- matchExpressions:
|
||||||
|
- key: jetson
|
||||||
|
operator: In
|
||||||
|
values:
|
||||||
|
- "true"
|
||||||
- matchExpressions:
|
- matchExpressions:
|
||||||
- key: hardware
|
- key: hardware
|
||||||
operator: In
|
operator: In
|
||||||
values:
|
values:
|
||||||
- rpi5
|
- rpi5
|
||||||
|
preferredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
- weight: 100
|
||||||
|
preference:
|
||||||
|
matchExpressions:
|
||||||
|
- key: jetson
|
||||||
|
operator: In
|
||||||
|
values:
|
||||||
|
- "true"
|
||||||
|
|||||||
@ -40,17 +40,27 @@ spec:
|
|||||||
discovery.type: single-node
|
discovery.type: single-node
|
||||||
plugins.security.disabled: true
|
plugins.security.disabled: true
|
||||||
node.store.allow_mmap: false
|
node.store.allow_mmap: false
|
||||||
nodeSelector:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
hardware: rpi5
|
|
||||||
affinity:
|
affinity:
|
||||||
nodeAffinity:
|
nodeAffinity:
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
nodeSelectorTerms:
|
nodeSelectorTerms:
|
||||||
|
- matchExpressions:
|
||||||
|
- key: jetson
|
||||||
|
operator: In
|
||||||
|
values:
|
||||||
|
- "true"
|
||||||
- matchExpressions:
|
- matchExpressions:
|
||||||
- key: hardware
|
- key: hardware
|
||||||
operator: In
|
operator: In
|
||||||
values:
|
values:
|
||||||
- rpi5
|
- rpi5
|
||||||
|
preferredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
- weight: 100
|
||||||
|
preference:
|
||||||
|
matchExpressions:
|
||||||
|
- key: jetson
|
||||||
|
operator: In
|
||||||
|
values:
|
||||||
|
- "true"
|
||||||
sysctlInit:
|
sysctlInit:
|
||||||
enabled: true
|
enabled: true
|
||||||
|
|||||||
@ -76,15 +76,25 @@ spec:
|
|||||||
memory: "256Mi"
|
memory: "256Mi"
|
||||||
limits:
|
limits:
|
||||||
memory: "512Mi"
|
memory: "512Mi"
|
||||||
nodeSelector:
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
hardware: rpi5
|
|
||||||
affinity:
|
affinity:
|
||||||
nodeAffinity:
|
nodeAffinity:
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
nodeSelectorTerms:
|
nodeSelectorTerms:
|
||||||
|
- matchExpressions:
|
||||||
|
- key: jetson
|
||||||
|
operator: In
|
||||||
|
values:
|
||||||
|
- "true"
|
||||||
- matchExpressions:
|
- matchExpressions:
|
||||||
- key: hardware
|
- key: hardware
|
||||||
operator: In
|
operator: In
|
||||||
values:
|
values:
|
||||||
- rpi5
|
- rpi5
|
||||||
|
preferredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
- weight: 100
|
||||||
|
preference:
|
||||||
|
matchExpressions:
|
||||||
|
- key: jetson
|
||||||
|
operator: In
|
||||||
|
values:
|
||||||
|
- "true"
|
||||||
|
|||||||
@ -12,39 +12,77 @@ k3s_agent_dropin="/host/etc/systemd/system/k3s-agent.service.d/99-logging.conf"
|
|||||||
k3s_image_gc_dropin="/host/etc/systemd/system/k3s.service.d/98-image-gc.conf"
|
k3s_image_gc_dropin="/host/etc/systemd/system/k3s.service.d/98-image-gc.conf"
|
||||||
k3s_agent_image_gc_dropin="/host/etc/systemd/system/k3s-agent.service.d/98-image-gc.conf"
|
k3s_agent_image_gc_dropin="/host/etc/systemd/system/k3s-agent.service.d/98-image-gc.conf"
|
||||||
|
|
||||||
if [ ! -f "${journald_dropin}" ]; then
|
ensure_dropin() {
|
||||||
mkdir -p "$(dirname "${journald_dropin}")"
|
local path="$1"
|
||||||
printf "[Journal]\nStorage=volatile\nRuntimeMaxUse=200M\nRuntimeKeepFree=512M\nMaxFileSec=1h\n" > "${journald_dropin}"
|
local owner="$2"
|
||||||
changed=1
|
local new_content="$3"
|
||||||
journald_changed=1
|
local current=""
|
||||||
|
if [ -f "${path}" ]; then
|
||||||
|
current="$(cat "${path}" || true)"
|
||||||
|
fi
|
||||||
|
if [ "${current}" != "${new_content}" ]; then
|
||||||
|
mkdir -p "$(dirname "${path}")"
|
||||||
|
printf "%s\n" "${new_content}" > "${path}"
|
||||||
|
changed=1
|
||||||
|
case "${owner}" in
|
||||||
|
journald)
|
||||||
|
journald_changed=1
|
||||||
|
;;
|
||||||
|
k3s)
|
||||||
|
k3s_changed=1
|
||||||
|
;;
|
||||||
|
k3s-agent)
|
||||||
|
k3s_agent_changed=1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
ensure_dropin \
|
||||||
|
"${journald_dropin}" \
|
||||||
|
"journald" \
|
||||||
|
"[Journal]
|
||||||
|
Storage=volatile
|
||||||
|
RuntimeMaxUse=200M
|
||||||
|
RuntimeKeepFree=512M
|
||||||
|
MaxFileSec=1h"
|
||||||
|
|
||||||
|
if [ -f "/host/etc/systemd/system/k3s.service" ]; then
|
||||||
|
ensure_dropin \
|
||||||
|
"${k3s_dropin}" \
|
||||||
|
"k3s" \
|
||||||
|
"[Service]
|
||||||
|
Environment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"
|
||||||
|
Environment=\"K3S_KUBELET_ARG=container-log-max-files=2\""
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_dropin}" ]; then
|
if [ -f "/host/etc/systemd/system/k3s.service" ]; then
|
||||||
mkdir -p "$(dirname "${k3s_dropin}")"
|
ensure_dropin \
|
||||||
printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_dropin}"
|
"${k3s_image_gc_dropin}" \
|
||||||
changed=1
|
"k3s" \
|
||||||
k3s_changed=1
|
"[Service]
|
||||||
|
Environment=\"K3S_KUBELET_ARG=image-gc-high-threshold=65\"
|
||||||
|
Environment=\"K3S_KUBELET_ARG=image-gc-low-threshold=50\"
|
||||||
|
Environment=\"K3S_KUBELET_ARG=image-gc-minimum-available=8Gi\""
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -f "/host/etc/systemd/system/k3s.service" ] && [ ! -f "${k3s_image_gc_dropin}" ]; then
|
if [ -f "/host/etc/systemd/system/k3s-agent.service" ]; then
|
||||||
mkdir -p "$(dirname "${k3s_image_gc_dropin}")"
|
ensure_dropin \
|
||||||
printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_image_gc_dropin}"
|
"${k3s_agent_dropin}" \
|
||||||
changed=1
|
"k3s-agent" \
|
||||||
k3s_changed=1
|
"[Service]
|
||||||
|
Environment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"
|
||||||
|
Environment=\"K3S_KUBELET_ARG=container-log-max-files=2\""
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_dropin}" ]; then
|
if [ -f "/host/etc/systemd/system/k3s-agent.service" ]; then
|
||||||
mkdir -p "$(dirname "${k3s_agent_dropin}")"
|
ensure_dropin \
|
||||||
printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-size=10Mi\"\nEnvironment=\"K3S_KUBELET_ARG=container-log-max-files=2\"\n" > "${k3s_agent_dropin}"
|
"${k3s_agent_image_gc_dropin}" \
|
||||||
changed=1
|
"k3s-agent" \
|
||||||
k3s_agent_changed=1
|
"[Service]
|
||||||
fi
|
Environment=\"K3S_KUBELET_ARG=image-gc-high-threshold=65\"
|
||||||
|
Environment=\"K3S_KUBELET_ARG=image-gc-low-threshold=50\"
|
||||||
if [ -f "/host/etc/systemd/system/k3s-agent.service" ] && [ ! -f "${k3s_agent_image_gc_dropin}" ]; then
|
Environment=\"K3S_KUBELET_ARG=image-gc-minimum-available=8Gi\""
|
||||||
mkdir -p "$(dirname "${k3s_agent_image_gc_dropin}")"
|
|
||||||
printf "[Service]\nEnvironment=\"K3S_KUBELET_ARG=image-gc-high-threshold=70\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-low-threshold=60\"\nEnvironment=\"K3S_KUBELET_ARG=image-gc-minimum-available=5Gi\"\n" > "${k3s_agent_image_gc_dropin}"
|
|
||||||
changed=1
|
|
||||||
k3s_agent_changed=1
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "${changed}" -eq 1 ]; then
|
if [ "${changed}" -eq 1 ]; then
|
||||||
|
|||||||
@ -18,6 +18,7 @@ spec:
|
|||||||
prometheus.io/scrape: "true"
|
prometheus.io/scrape: "true"
|
||||||
prometheus.io/port: "8080"
|
prometheus.io/port: "8080"
|
||||||
prometheus.io/path: "/metrics"
|
prometheus.io/path: "/metrics"
|
||||||
|
maintenance.bstein.dev/restart-rev: "20260207-2"
|
||||||
vault.hashicorp.com/agent-inject: "true"
|
vault.hashicorp.com/agent-inject: "true"
|
||||||
vault.hashicorp.com/role: "maintenance"
|
vault.hashicorp.com/role: "maintenance"
|
||||||
vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
|
vault.hashicorp.com/agent-inject-secret-ariadne-env.sh: "kv/data/atlas/maintenance/ariadne-db"
|
||||||
@ -105,7 +106,7 @@ spec:
|
|||||||
node-role.kubernetes.io/worker: "true"
|
node-role.kubernetes.io/worker: "true"
|
||||||
containers:
|
containers:
|
||||||
- name: ariadne
|
- name: ariadne
|
||||||
image: registry.bstein.dev/bstein/ariadne:0.1.0-0
|
image: registry.bstein.dev/bstein/ariadne:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
command: ["/bin/sh", "-c"]
|
command: ["/bin/sh", "-c"]
|
||||||
args:
|
args:
|
||||||
@ -285,7 +286,7 @@ spec:
|
|||||||
- name: ARIADNE_SCHEDULE_MAILU_SYNC
|
- name: ARIADNE_SCHEDULE_MAILU_SYNC
|
||||||
value: "30 4 * * *"
|
value: "30 4 * * *"
|
||||||
- name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC
|
- name: ARIADNE_SCHEDULE_NEXTCLOUD_SYNC
|
||||||
value: "0 5 * * *"
|
value: "*/15 * * * *"
|
||||||
- name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON
|
- name: ARIADNE_SCHEDULE_NEXTCLOUD_CRON
|
||||||
value: "*/5 * * * *"
|
value: "*/5 * * * *"
|
||||||
- name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
|
- name: ARIADNE_SCHEDULE_NEXTCLOUD_MAINTENANCE
|
||||||
@ -293,23 +294,23 @@ spec:
|
|||||||
- name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
|
- name: ARIADNE_SCHEDULE_VAULTWARDEN_SYNC
|
||||||
value: "0 * * * *"
|
value: "0 * * * *"
|
||||||
- name: ARIADNE_SCHEDULE_WGER_USER_SYNC
|
- name: ARIADNE_SCHEDULE_WGER_USER_SYNC
|
||||||
value: "0 5 * * *"
|
value: "*/15 * * * *"
|
||||||
- name: ARIADNE_SCHEDULE_WGER_ADMIN
|
- name: ARIADNE_SCHEDULE_WGER_ADMIN
|
||||||
value: "15 3 * * *"
|
value: "15 3 * * *"
|
||||||
- name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC
|
- name: ARIADNE_SCHEDULE_FIREFLY_USER_SYNC
|
||||||
value: "0 6 * * *"
|
value: "*/15 * * * *"
|
||||||
- name: ARIADNE_SCHEDULE_FIREFLY_CRON
|
- name: ARIADNE_SCHEDULE_FIREFLY_CRON
|
||||||
value: "0 3 * * *"
|
value: "0 3 * * *"
|
||||||
- name: ARIADNE_SCHEDULE_POD_CLEANER
|
- name: ARIADNE_SCHEDULE_POD_CLEANER
|
||||||
value: "0 * * * *"
|
value: "*/30 * * * *"
|
||||||
- name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE
|
- name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE
|
||||||
value: "23 3 * * *"
|
value: "23 3 * * *"
|
||||||
- name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
|
- name: ARIADNE_SCHEDULE_IMAGE_SWEEPER
|
||||||
value: "30 4 * * 0"
|
value: "0 */4 * * *"
|
||||||
- name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
|
- name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH
|
||||||
value: "0 * * * *"
|
value: "*/15 * * * *"
|
||||||
- name: ARIADNE_SCHEDULE_VAULT_OIDC
|
- name: ARIADNE_SCHEDULE_VAULT_OIDC
|
||||||
value: "0 * * * *"
|
value: "*/15 * * * *"
|
||||||
- name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
|
- name: ARIADNE_SCHEDULE_COMMS_GUEST_NAME
|
||||||
value: "*/5 * * * *"
|
value: "*/5 * * * *"
|
||||||
- name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
|
- name: ARIADNE_SCHEDULE_COMMS_PIN_INVITE
|
||||||
@ -319,9 +320,9 @@ spec:
|
|||||||
- name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM
|
- name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM
|
||||||
value: "*/10 * * * *"
|
value: "*/10 * * * *"
|
||||||
- name: ARIADNE_SCHEDULE_CLUSTER_STATE
|
- name: ARIADNE_SCHEDULE_CLUSTER_STATE
|
||||||
value: "*/15 * * * *"
|
value: "*/10 * * * *"
|
||||||
- name: ARIADNE_CLUSTER_STATE_KEEP
|
- name: ARIADNE_CLUSTER_STATE_KEEP
|
||||||
value: "168"
|
value: "720"
|
||||||
- name: WELCOME_EMAIL_ENABLED
|
- name: WELCOME_EMAIL_ENABLED
|
||||||
value: "true"
|
value: "true"
|
||||||
- name: K8S_API_TIMEOUT_SEC
|
- name: K8S_API_TIMEOUT_SEC
|
||||||
@ -330,6 +331,8 @@ spec:
|
|||||||
value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
|
value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
|
||||||
- name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC
|
- name: ARIADNE_CLUSTER_STATE_VM_TIMEOUT_SEC
|
||||||
value: "5"
|
value: "5"
|
||||||
|
- name: ARIADNE_ALERTMANAGER_URL
|
||||||
|
value: http://alertmanager.monitoring.svc.cluster.local
|
||||||
- name: OPENSEARCH_URL
|
- name: OPENSEARCH_URL
|
||||||
value: http://opensearch-master.logging.svc.cluster.local:9200
|
value: http://opensearch-master.logging.svc.cluster.local:9200
|
||||||
- name: OPENSEARCH_LIMIT_BYTES
|
- name: OPENSEARCH_LIMIT_BYTES
|
||||||
|
|||||||
@ -33,17 +33,15 @@ spec:
|
|||||||
command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
|
command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
|
||||||
env:
|
env:
|
||||||
- name: SWEEP_INTERVAL_SEC
|
- name: SWEEP_INTERVAL_SEC
|
||||||
value: "21600"
|
value: "7200"
|
||||||
- name: HIGH_USAGE_PERCENT
|
- name: HIGH_USAGE_PERCENT
|
||||||
value: "70"
|
value: "70"
|
||||||
- name: EMERGENCY_USAGE_PERCENT
|
- name: EMERGENCY_USAGE_PERCENT
|
||||||
value: "80"
|
value: "80"
|
||||||
- name: BASE_THRESHOLD_DAYS
|
|
||||||
value: "14"
|
|
||||||
- name: HIGH_USAGE_THRESHOLD_DAYS
|
|
||||||
value: "3"
|
|
||||||
- name: LOG_RETENTION_DAYS
|
- name: LOG_RETENTION_DAYS
|
||||||
value: "7"
|
value: "7"
|
||||||
|
- name: ORPHAN_POD_RETENTION_DAYS
|
||||||
|
value: "3"
|
||||||
- name: JOURNAL_MAX_SIZE
|
- name: JOURNAL_MAX_SIZE
|
||||||
value: "200M"
|
value: "200M"
|
||||||
securityContext:
|
securityContext:
|
||||||
|
|||||||
@ -3,96 +3,71 @@ set -eu
|
|||||||
|
|
||||||
ONE_SHOT=${ONE_SHOT:-false}
|
ONE_SHOT=${ONE_SHOT:-false}
|
||||||
SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
|
SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
|
||||||
BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
|
|
||||||
HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
|
|
||||||
HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
|
HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
|
||||||
EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
|
EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
|
||||||
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
|
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
|
||||||
|
ORPHAN_POD_RETENTION_DAYS=${ORPHAN_POD_RETENTION_DAYS:-3}
|
||||||
JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
|
JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
|
||||||
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
|
|
||||||
|
# Remove per-pod log directories under /host/var/log.hdd/pods whose pod no
# longer exists under /host/var/log/pods, once they are older than
# ORPHAN_POD_RETENTION_DAYS. No-op when the HDD log tree is absent.
cleanup_orphaned_hdd_pod_logs() {
  [ -d /host/var/log.hdd/pods ] || return 0

  # Export the retention knob into the python helper's environment.
  ORPHAN_POD_RETENTION_DAYS="${ORPHAN_POD_RETENTION_DAYS}" python3 - <<'PY'
import os
import shutil
import time

hdd_pods = "/host/var/log.hdd/pods"
active_pods = "/host/var/log/pods"
retention_days = int(os.environ.get("ORPHAN_POD_RETENTION_DAYS", "3"))
cutoff = time.time() - (retention_days * 86400)

try:
    active_names = set(os.listdir(active_pods))
except Exception:
    active_names = set()

try:
    hdd_names = os.listdir(hdd_pods)
except Exception:
    hdd_names = []

for name in hdd_names:
    path = os.path.join(hdd_pods, name)
    if not os.path.isdir(path):
        continue
    if name in active_names:
        continue
    try:
        mtime = os.path.getmtime(path)
    except Exception:
        continue
    if mtime > cutoff:
        continue
    print(path)
    shutil.rmtree(path, ignore_errors=True)
PY
}
|
||||||
|
|
||||||
sweep_once() {
|
sweep_once() {
|
||||||
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
|
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
|
||||||
threshold_days="${BASE_THRESHOLD_DAYS}"
|
|
||||||
|
# crictl image metadata frequently omits createdAt on this cluster; prune by
|
||||||
|
# runtime reachability whenever rootfs crosses pressure thresholds.
|
||||||
if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
|
if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
|
||||||
threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
|
chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY'
|
cleanup_orphaned_hdd_pod_logs
|
||||||
import os
|
|
||||||
import time
|
|
||||||
|
|
||||||
days = int(os.environ.get("THRESHOLD_DAYS", "14"))
|
if [ -d /host/var/log.hdd/pods ]; then
|
||||||
print(int(time.time()) - days * 86400)
|
find /host/var/log.hdd/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
|
||||||
PY
|
fi
|
||||||
)
|
|
||||||
|
|
||||||
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
|
if [ -d /host/var/log.hdd/containers ]; then
|
||||||
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
|
find /host/var/log.hdd/containers -xtype l -print -delete 2>/dev/null || true
|
||||||
|
|
||||||
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
|
|
||||||
try:
|
|
||||||
data = json.load(sys.stdin)
|
|
||||||
except Exception:
|
|
||||||
print("", end="")
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
cutoff = int(os.environ.get("CUTOFF", "0"))
|
|
||||||
running = set(os.environ.get("RUNNING", "").split())
|
|
||||||
skip = os.environ.get("SKIP", "").split()
|
|
||||||
now = int(time.time())
|
|
||||||
prune = []
|
|
||||||
|
|
||||||
|
|
||||||
def is_skip(tags):
|
|
||||||
if not tags:
|
|
||||||
return False
|
|
||||||
for t in tags:
|
|
||||||
for prefix in skip:
|
|
||||||
if prefix and t.startswith(prefix):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
for img in data.get("images", []):
|
|
||||||
image_id = img.get("id", "")
|
|
||||||
if not image_id:
|
|
||||||
continue
|
|
||||||
if image_id in running:
|
|
||||||
continue
|
|
||||||
tags = img.get("repoTags") or []
|
|
||||||
if is_skip(tags):
|
|
||||||
continue
|
|
||||||
created = img.get("createdAt") or 0
|
|
||||||
try:
|
|
||||||
created = int(str(created)) // 1000000000
|
|
||||||
except Exception:
|
|
||||||
created = 0
|
|
||||||
if created and created > now:
|
|
||||||
created = now
|
|
||||||
if cutoff and created and created < cutoff:
|
|
||||||
prune.append(image_id)
|
|
||||||
|
|
||||||
seen = set()
|
|
||||||
for p in prune:
|
|
||||||
if p in seen:
|
|
||||||
continue
|
|
||||||
seen.add(p)
|
|
||||||
print(p)
|
|
||||||
PY
|
|
||||||
)
|
|
||||||
|
|
||||||
if [ -n "${prune_list}" ]; then
|
|
||||||
printf "%s" "${prune_list}" | while read -r image_id; do
|
|
||||||
if [ -n "${image_id}" ]; then
|
|
||||||
chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
|
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
|
||||||
@ -100,9 +75,11 @@ PY
|
|||||||
|
|
||||||
if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
|
if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
|
||||||
# Emergency pass for rootfs pressure on SD-backed nodes.
|
# Emergency pass for rootfs pressure on SD-backed nodes.
|
||||||
|
chroot /host /bin/sh -c "crictl rmi --prune >/dev/null 2>&1 || true"
|
||||||
chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
|
chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
|
||||||
find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
|
find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
|
||||||
find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
|
find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
|
||||||
|
find /host/var/log.hdd -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
|
||||||
chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
|
chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|||||||
@ -303,8 +303,56 @@ data:
|
|||||||
summary: "node-image-sweeper not fully ready"
|
summary: "node-image-sweeper not fully ready"
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
- uid: logging-node-log-rotation-not-ready
|
||||||
|
title: "Node log rotation guardrails not ready"
|
||||||
|
condition: C
|
||||||
|
for: "10m"
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 600
|
||||||
|
to: 0
|
||||||
|
datasourceUid: atlas-vm
|
||||||
|
model:
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
expr: kube_daemonset_status_number_ready{namespace="logging",daemonset="node-log-rotation"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="logging",daemonset="node-log-rotation"}
|
||||||
|
legendFormat: '{{daemonset}}'
|
||||||
|
datasource:
|
||||||
|
type: prometheus
|
||||||
|
uid: atlas-vm
|
||||||
|
- refId: B
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: A
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
reducer: last
|
||||||
|
type: reduce
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
expression: B
|
||||||
|
intervalMs: 60000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
type: threshold
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params: [1]
|
||||||
|
type: lt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
reducer:
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
noDataState: NoData
|
||||||
|
execErrState: Error
|
||||||
|
annotations:
|
||||||
|
summary: "node-log-rotation is not fully ready"
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
- uid: maint-ariadne-image-sweeper-stale
|
- uid: maint-ariadne-image-sweeper-stale
|
||||||
title: "Ariadne image sweeper stale (schedule >8d)"
|
title: "Ariadne image sweeper stale (schedule >24h)"
|
||||||
condition: C
|
condition: C
|
||||||
for: "5m"
|
for: "5m"
|
||||||
data:
|
data:
|
||||||
@ -338,7 +386,7 @@ data:
|
|||||||
type: threshold
|
type: threshold
|
||||||
conditions:
|
conditions:
|
||||||
- evaluator:
|
- evaluator:
|
||||||
params: [691200]
|
params: [86400]
|
||||||
type: gt
|
type: gt
|
||||||
operator:
|
operator:
|
||||||
type: and
|
type: and
|
||||||
@ -348,7 +396,7 @@ data:
|
|||||||
noDataState: OK
|
noDataState: OK
|
||||||
execErrState: Error
|
execErrState: Error
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Ariadne image sweeper stale >8d since last success"
|
summary: "Ariadne image sweeper stale >24h since last success"
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- uid: maint-cron-stale
|
- uid: maint-cron-stale
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user