Compare commits

..

No commits in common. "3ea296b552ae970594cd5cb742ad18a7c65f02cb" and "131c34012bd59af402428add9eba5f8113a3dec0" have entirely different histories.

11 changed files with 39 additions and 856 deletions

View File

@ -35,7 +35,6 @@ data:
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
PUBLIC_FOLDER = "overview"
PRIVATE_FOLDER = "atlas-internal"
ASTRAIOS_MOUNTPOINT = "/mnt/astraios"
PERCENT_THRESHOLDS = {
"mode": "absolute",
@ -157,10 +156,6 @@ def root_usage_expr(scope=""):
return filesystem_usage_expr("/", scope)
def astraios_usage_expr(scope=""):
    """Return the filesystem usage expression for the Astraios mountpoint.

    Delegates to ``filesystem_usage_expr`` with ``ASTRAIOS_MOUNTPOINT`` as the
    mountpoint; ``scope`` is forwarded unchanged.
    """
    mountpoint = ASTRAIOS_MOUNTPOINT
    return filesystem_usage_expr(mountpoint, scope)
def astreae_usage_expr(mount):
return (
f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / "
@ -475,7 +470,6 @@ PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
)
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))"
ANANKE_SELECTOR = 'job="ananke-power"'
ANANKE_UPS_DB_NAME = "Pyrphoros"
ANANKE_UPS_DB_NODE = "titan-db"
@ -1546,27 +1540,26 @@ def build_overview():
panels.append(
bargauge_panel(
47,
"PVC Backup Health / Age",
PVC_BACKUP_AGE_HOURS_BY_PVC,
"Platform Suite Pass Rate (24h)",
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE,
{"h": 5, "w": 6, "x": 18, "y": 7},
unit="h",
unit="percent",
instant=True,
legend="{{namespace}}/{{pvc}}",
legend="{{suite}}",
sort_order="desc",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 6},
{"color": "orange", "value": 12},
{"color": "red", "value": 24},
{"color": "red", "value": None},
{"color": "yellow", "value": 80},
{"color": "green", "value": 95},
],
},
)
)
panels[-1]["links"] = link_to("atlas-storage")
panels[-1]["links"] = link_to("atlas-jobs")
panels[-1]["description"] = (
"Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
"24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
)
panels.append(
@ -1854,8 +1847,8 @@ def build_overview():
panels.append(
bargauge_panel(
22,
"Nodes Closest to Full Astraios Disks",
f"topk(12, {astraios_usage_expr()})",
"Nodes Closest to Full Root Disks",
f"topk(12, {root_usage_expr()})",
{"h": 16, "w": 12, "x": 12, "y": 71},
unit="percent",
thresholds=PERCENT_THRESHOLDS,
@ -2228,19 +2221,6 @@ def build_nodes_dashboard():
time_from="30d",
)
)
panels.append(
timeseries_panel(
9,
"Astraios Usage",
astraios_usage_expr(),
{"h": 9, "w": 24, "x": 0, "y": 44},
unit="percent",
legend="{{node}}",
legend_display="table",
legend_placement="right",
time_from="30d",
)
)
return {
"uid": "atlas-nodes",
"title": "Atlas Nodes",

View File

@ -3,7 +3,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- pi-usb-scratch-configmap.yaml
- image.yaml
- secretproviderclass.yaml
- metis-configmap.yaml
@ -19,9 +18,7 @@ resources:
- metis-rbac.yaml
- metis-token-sync-serviceaccount.yaml
- node-nofile-serviceaccount.yaml
- pi-usb-scratch-serviceaccount.yaml
- pod-cleaner-rbac.yaml
- pi-usb-scratch-rbac.yaml
- ariadne-deployment.yaml
- metis-deployment.yaml
- oneoffs/ariadne-migrate-job.yaml
@ -29,7 +26,6 @@ resources:
- disable-k3s-traefik-daemonset.yaml
- oneoffs/k3s-traefik-cleanup-job.yaml
- node-nofile-daemonset.yaml
- pi-usb-scratch-daemonset.yaml
- metis-sentinel-amd64-daemonset.yaml
- metis-sentinel-arm64-daemonset.yaml
- metis-k3s-token-sync-cronjob.yaml
@ -78,9 +74,3 @@ configMapGenerator:
- node_image_sweeper.sh=scripts/node_image_sweeper.sh
options:
disableNameSuffixHash: true
- name: pi-usb-scratch-script
namespace: maintenance
files:
- pi_usb_scratch.sh=scripts/pi_usb_scratch.sh
options:
disableNameSuffixHash: true

View File

@ -1,48 +0,0 @@
# services/maintenance/pi-usb-scratch-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: pi-usb-scratch-config
namespace: maintenance
data:
usb_scratch.env: |
USB_SCRATCH_DEFAULT_ENABLED=true
# Leave empty to avoid label-based fallback selection.
USB_SCRATCH_DEFAULT_LABEL=
USB_SCRATCH_DEFAULT_FSTYPE=ext4
USB_SCRATCH_MOUNTPOINT=/mnt/astraios
# Auto-select the removable 64GB USB partition on each worker.
USB_SCRATCH_AUTO_SELECT_REMOVABLE=true
USB_SCRATCH_AUTO_MIN_SIZE_GIB=50
# One-time bootstrap for new sticks that ship exfat/fat32.
USB_SCRATCH_AUTO_FORMAT_REMOVABLE=true
USB_SCRATCH_AUTO_FORMAT_LABEL=astraios
# Keep /tmp in RAM to reduce SD-card writes.
USB_SCRATCH_ENFORCE_TMPFS_TMP=true
USB_SCRATCH_REQUIRED_FREE_GIB=20
USB_SCRATCH_RECONCILE_INTERVAL_SEC=900
USB_SCRATCH_CUTOVER_JITTER_MAX_SEC=900
usb_scratch_inventory.tsv: |
# node_name enabled match_kind match_value fstype
# match_kind: uuid | label | device
# Astraios policy:
# - use UUID entries per worker node (preferred)
# - avoid shared labels to prevent accidental wrong-device mounts
# - mountpoint is /mnt/astraios on every worker node
# Example:
# titan-04 true uuid 11111111-2222-3333-4444-555555555555 ext4
# titan-05 true uuid <uuid-for-titan-05-astraios> ext4
# titan-06 true uuid <uuid-for-titan-06-astraios> ext4
# titan-07 true uuid <uuid-for-titan-07-astraios> ext4
# titan-08 true uuid <uuid-for-titan-08-astraios> ext4
# titan-09 true uuid <uuid-for-titan-09-astraios> ext4
# titan-10 true uuid <uuid-for-titan-10-astraios> ext4
# titan-11 true uuid <uuid-for-titan-11-astraios> ext4
# titan-12 true uuid <uuid-for-titan-12-astraios> ext4
# titan-13 true uuid <uuid-for-titan-13-astraios> ext4
# titan-14 true uuid <uuid-for-titan-14-astraios> ext4
# titan-15 true uuid <uuid-for-titan-15-astraios> ext4
# titan-16 true uuid <uuid-for-titan-16-astraios> ext4
# titan-17 true uuid <uuid-for-titan-17-astraios> ext4
# titan-18 true uuid <uuid-for-titan-18-astraios> ext4
# titan-19 true uuid <uuid-for-titan-19-astraios> ext4

View File

@ -1,68 +0,0 @@
# services/maintenance/pi-usb-scratch-daemonset.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: pi-usb-scratch
namespace: maintenance
spec:
selector:
matchLabels:
app: pi-usb-scratch
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
template:
metadata:
labels:
app: pi-usb-scratch
spec:
serviceAccountName: pi-usb-scratch
hostPID: true
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi4
- rpi5
containers:
- name: pi-usb-scratch
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/usr/bin/env", "bash"]
args: ["/scripts/pi_usb_scratch.sh"]
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
securityContext:
privileged: true
runAsUser: 0
volumeMounts:
- name: host-root
mountPath: /host
- name: script
mountPath: /scripts
readOnly: true
- name: config
mountPath: /config
readOnly: true
volumes:
- name: host-root
hostPath:
path: /
- name: script
configMap:
name: pi-usb-scratch-script
defaultMode: 0555
- name: config
configMap:
name: pi-usb-scratch-config
defaultMode: 0444

View File

@ -1,26 +0,0 @@
# services/maintenance/pi-usb-scratch-rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: pi-usb-scratch
rules:
- apiGroups: [""]
resources:
- nodes
verbs:
- get
- list
- patch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: pi-usb-scratch
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: pi-usb-scratch
subjects:
- kind: ServiceAccount
name: pi-usb-scratch
namespace: maintenance

View File

@ -1,6 +0,0 @@
# services/maintenance/pi-usb-scratch-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: pi-usb-scratch
namespace: maintenance

View File

@ -1,555 +0,0 @@
#!/usr/bin/env bash
# Node-local reconciler for the USB scratch ("Astraios") volume: resolves the
# device for this node, maintains a managed /etc/fstab block and a k3s-agent
# drop-in, and bind-mounts TARGET_PATHS onto the scratch mount.
set -euo pipefail

# Required: node this script acts on (inventory lookup and node annotations).
NODE_NAME=${NODE_NAME:?NODE_NAME is required}
# Path under which the host root filesystem is visible to this script.
HOST_ROOT=${HOST_ROOT:-/host}
# Optional env-style settings file and per-node inventory table.
CONFIG_ENV=${CONFIG_ENV:-/config/usb_scratch.env}
INVENTORY_FILE=${INVENTORY_FILE:-/config/usb_scratch_inventory.tsv}
FSTAB_PATH="${HOST_ROOT}/etc/fstab"
# Working directory for candidate files written before they are installed.
STATE_DIR="${HOST_ROOT}/var/lib/maintenance/pi-usb-scratch"
# Marker lines delimiting the fstab block owned by this script.
MANAGED_BEGIN="# BEGIN maintenance.bstein.dev usb-scratch"
MANAGED_END="# END maintenance.bstein.dev usb-scratch"
# When "true", main() exits after a single reconcile pass.
ONE_SHOT=${ONE_SHOT:-false}
# PATH exported into every host-side shell snippet run by host_sh.
DEFAULT_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"

# Defaults for all tunables; load_config re-applies the same defaults after
# sourcing CONFIG_ENV, so values from that file take precedence.
USB_SCRATCH_DEFAULT_ENABLED=${USB_SCRATCH_DEFAULT_ENABLED:-true}
USB_SCRATCH_DEFAULT_LABEL=${USB_SCRATCH_DEFAULT_LABEL:-}
USB_SCRATCH_DEFAULT_FSTYPE=${USB_SCRATCH_DEFAULT_FSTYPE:-ext4}
USB_SCRATCH_MOUNTPOINT=${USB_SCRATCH_MOUNTPOINT:-/mnt/astraios}
USB_SCRATCH_ENFORCE_TMPFS_TMP=${USB_SCRATCH_ENFORCE_TMPFS_TMP:-true}
USB_SCRATCH_AUTO_SELECT_REMOVABLE=${USB_SCRATCH_AUTO_SELECT_REMOVABLE:-true}
USB_SCRATCH_AUTO_MIN_SIZE_GIB=${USB_SCRATCH_AUTO_MIN_SIZE_GIB:-50}
USB_SCRATCH_AUTO_FORMAT_REMOVABLE=${USB_SCRATCH_AUTO_FORMAT_REMOVABLE:-true}
USB_SCRATCH_AUTO_FORMAT_LABEL=${USB_SCRATCH_AUTO_FORMAT_LABEL:-astraios}
USB_SCRATCH_REQUIRED_FREE_GIB=${USB_SCRATCH_REQUIRED_FREE_GIB:-20}
USB_SCRATCH_RECONCILE_INTERVAL_SEC=${USB_SCRATCH_RECONCILE_INTERVAL_SEC:-900}
USB_SCRATCH_CUTOVER_JITTER_MAX_SEC=${USB_SCRATCH_CUTOVER_JITTER_MAX_SEC:-900}

# Host paths that get bind-mounted from ${USB_SCRATCH_MOUNTPOINT}<path>.
TARGET_PATHS=(
  "/var/log/pods"
  "/var/log/containers"
  "/var/lib/rancher/k3s/agent/containerd"
  "/var/lib/rancher/k3s/agent/kubelet"
  "/var/lib/rancher/k3s/agent/images"
  "/var/tmp"
)

# Set to 1 while k3s-agent is stopped for a cutover so the EXIT trap can
# restart it if the script dies mid-way.
agent_stopped=0
# Print one log line: a bracketed UTC ISO-8601 timestamp followed by all
# arguments joined with spaces.
log() {
  local stamp
  stamp="$(date -u +%FT%TZ)"
  printf '[%s] %s\n' "${stamp}" "$*"
}
# Make a string safe to pass as an annotation value: spaces become
# underscores, then every character outside [:alnum:] . _ : / = - is dropped.
sanitize_annotation_value() {
  local raw="$1"
  printf '%s' "${raw// /_}" | tr -cd '[:alnum:]._:/=-'
}
# Record reconcile state on the node object as annotations.
#   $1 status, $2 detail, $3 selector
# Each value is written twice, under the "astraios-" and "usb-scratch-"
# annotation prefixes. Best effort: kubectl output is discarded and a
# failure never propagates to the caller.
annotate_node() {
  local status="$1"
  local detail="$2"
  local selector="$3"
  local timestamp prefix
  local -a annotations=()

  timestamp="$(date -u +%FT%TZ)"
  for prefix in astraios usb-scratch; do
    annotations+=(
      "maintenance.bstein.dev/${prefix}-status=$(sanitize_annotation_value "${status}")"
      "maintenance.bstein.dev/${prefix}-detail=$(sanitize_annotation_value "${detail}")"
      "maintenance.bstein.dev/${prefix}-selector=$(sanitize_annotation_value "${selector}")"
      "maintenance.bstein.dev/${prefix}-mountpoint=$(sanitize_annotation_value "${USB_SCRATCH_MOUNTPOINT}")"
      "maintenance.bstein.dev/${prefix}-managed-paths=$(sanitize_annotation_value "${TARGET_PATHS[*]}")"
      "maintenance.bstein.dev/${prefix}-last-apply=${timestamp}"
    )
  done

  kubectl annotate --overwrite node "${NODE_NAME}" "${annotations[@]}" \
    >/dev/null 2>&1 || true
}
# Run a shell snippet ($1) on the host with PATH set to DEFAULT_PATH.
# Preference order: nsenter from this image, nsenter found under HOST_ROOT
# (usr/bin, then bin), and finally a plain chroot into HOST_ROOT.
# The snippet runs under `/bin/sh -ceu`.
host_sh() {
  local snippet="$1"
  local script="PATH=${DEFAULT_PATH}; ${snippet}"
  local -a ns_args=(-t 1 -m -u -i -n -p --)
  local candidate

  if command -v nsenter >/dev/null 2>&1; then
    nsenter "${ns_args[@]}" /bin/sh -ceu "${script}"
    return
  fi

  for candidate in "${HOST_ROOT}/usr/bin/nsenter" "${HOST_ROOT}/bin/nsenter"; do
    if [ -x "${candidate}" ]; then
      "${candidate}" "${ns_args[@]}" /bin/sh -ceu "${script}"
      return
    fi
  done

  chroot "${HOST_ROOT}" /bin/sh -ceu "${script}"
}
# EXIT handler: if the script stops while k3s-agent is down for a cutover,
# start the agent again (ignoring a failed start) and clear the flag.
cleanup() {
  [ "${agent_stopped}" -eq 1 ] || return 0
  log "starting k3s-agent after interrupted cutover"
  host_sh "systemctl start k3s-agent || true"
  agent_stopped=0
}
trap cleanup EXIT
# Source CONFIG_ENV when it exists, then fill in the built-in default for
# every tunable that is still unset or empty.
load_config() {
  if [ -f "${CONFIG_ENV}" ]; then
    # shellcheck disable=SC1090
    source "${CONFIG_ENV}"
  fi

  : "${USB_SCRATCH_DEFAULT_ENABLED:=true}"
  : "${USB_SCRATCH_DEFAULT_LABEL:=}"
  : "${USB_SCRATCH_DEFAULT_FSTYPE:=ext4}"
  : "${USB_SCRATCH_MOUNTPOINT:=/mnt/astraios}"
  : "${USB_SCRATCH_ENFORCE_TMPFS_TMP:=true}"
  : "${USB_SCRATCH_AUTO_SELECT_REMOVABLE:=true}"
  : "${USB_SCRATCH_AUTO_MIN_SIZE_GIB:=50}"
  : "${USB_SCRATCH_AUTO_FORMAT_REMOVABLE:=true}"
  : "${USB_SCRATCH_AUTO_FORMAT_LABEL:=astraios}"
  : "${USB_SCRATCH_REQUIRED_FREE_GIB:=20}"
  : "${USB_SCRATCH_RECONCILE_INTERVAL_SEC:=900}"
  : "${USB_SCRATCH_CUTOVER_JITTER_MAX_SEC:=900}"
}
# Print the first non-comment inventory row (at least four fields) whose
# first field equals NODE_NAME. Prints nothing when the inventory file is
# absent or has no matching row.
lookup_inventory() {
  local entry
  if [ ! -f "${INVENTORY_FILE}" ]; then
    return 0
  fi
  entry="$(awk -v node="${NODE_NAME}" 'NF >= 4 && $1 !~ /^#/ && $1 == node { print; exit }' "${INVENTORY_FILE}" || true)"
  printf '%s' "${entry}"
}
# Print the file named by $1 with the managed block removed: the
# MANAGED_BEGIN line, the MANAGED_END line, and everything between them.
strip_managed_block() {
  local fstab_file="$1"
  awk -v begin="${MANAGED_BEGIN}" -v end="${MANAGED_END}" '
$0 == begin { skip=1; next }
$0 == end { skip=0; next }
skip != 1 { print }
' < "${fstab_file}"
}
# Rebuild the managed fstab block and install it if the result differs from
# the current FSTAB_PATH.
#   $1 fstab device selector (e.g. UUID=..., LABEL=..., or a device path)
#   $2 filesystem type for the scratch mount
# The candidate is the existing fstab minus any previous managed block (and
# minus existing /tmp entries when tmpfs enforcement is on), followed by a
# fresh block: the scratch mount, an optional tmpfs /tmp line, and one bind
# line per TARGET_PATHS entry.
# Returns 0 when fstab was rewritten, 1 when it was already up to date.
ensure_fstab_block() {
  local selector="$1"
  local fstype="$2"
  local base_copy candidate_copy bind_target

  mkdir -p "${STATE_DIR}"
  base_copy="${STATE_DIR}/fstab.base"
  candidate_copy="${STATE_DIR}/fstab.candidate"

  strip_managed_block "${FSTAB_PATH}" > "${base_copy}"
  if [ "${USB_SCRATCH_ENFORCE_TMPFS_TMP}" = "true" ]; then
    # Drop unmanaged /tmp entries; the managed block supplies its own.
    awk '$1 ~ /^#/ || $2 != "/tmp" { print }' "${base_copy}" > "${base_copy}.tmpfs"
    mv "${base_copy}.tmpfs" "${base_copy}"
  fi

  {
    cat "${base_copy}"
    printf '%s\n' "${MANAGED_BEGIN}"
    printf '%s %s %s defaults,noatime,lazytime,commit=60,x-systemd.device-timeout=15s,x-systemd.mount-timeout=30s 0 2\n' \
      "${selector}" "${USB_SCRATCH_MOUNTPOINT}" "${fstype}"
    if [ "${USB_SCRATCH_ENFORCE_TMPFS_TMP}" = "true" ]; then
      printf '%s\n' 'tmpfs /tmp tmpfs defaults,nosuid,nodev,mode=1777 0 0'
    fi
    for bind_target in "${TARGET_PATHS[@]}"; do
      printf '%s %s none bind,x-systemd.requires-mounts-for=%s 0 0\n' \
        "${USB_SCRATCH_MOUNTPOINT}${bind_target}" "${bind_target}" "${USB_SCRATCH_MOUNTPOINT}"
    done
    printf '%s\n' "${MANAGED_END}"
  } > "${candidate_copy}"

  if cmp -s "${FSTAB_PATH}" "${candidate_copy}"; then
    return 1
  fi
  cp "${candidate_copy}" "${FSTAB_PATH}"
  log "updated ${FSTAB_PATH} managed block"
  return 0
}
# Install the k3s-agent guard: a verification script on the host plus a
# systemd drop-in that runs it as ExecStartPre and declares
# RequiresMountsFor on the scratch mount and every TARGET_PATHS entry.
#
# The generated script checks that each target is a mountpoint backed by the
# same source device as the scratch mount and whose filesystem root equals
# the target path. It deliberately does NOT compare findmnt's SOURCE column
# against "${USB_SCRATCH_MOUNTPOINT}<target>": for bind mounts findmnt
# reports "device[/fsroot]", so that comparison can never succeed and would
# keep k3s-agent from starting.
# NOTE(review): assumes the scratch filesystem is mounted from its own root
# (FSROOT "/"), so a bind of <mountpoint>/<target> has FSROOT "<target>".
#
# The guard script is rewritten on every call. Returns 0 when the drop-in
# was created or changed (caller should daemon-reload), 1 when unchanged.
ensure_k3s_agent_guard() {
  local dropin_dir dropin_file guard_dir guard_file target verify_cmd requires_mounts
  local tmp_dropin
  dropin_dir="${HOST_ROOT}/etc/systemd/system/k3s-agent.service.d"
  dropin_file="${dropin_dir}/20-astraios-guard.conf"
  guard_dir="${HOST_ROOT}/usr/local/lib/maintenance"
  guard_file="${guard_dir}/verify_astraios_mounts.sh"
  tmp_dropin="${STATE_DIR}/k3s-agent-astraios-dropin.conf"
  mkdir -p "${dropin_dir}" "${guard_dir}" "${STATE_DIR}"

  # Heredoc bodies stay at column 0: they are written verbatim to the host.
  cat > "${guard_file}" <<EOF
#!/usr/bin/env bash
set -euo pipefail
mountpoint -q '${USB_SCRATCH_MOUNTPOINT}'
scratch_src=\$(findmnt -M '${USB_SCRATCH_MOUNTPOINT}' -n -v -o SOURCE 2>/dev/null | tail -n 1 || true)
EOF
  for target in "${TARGET_PATHS[@]}"; do
    cat >> "${guard_file}" <<EOF
src=\$(findmnt -M '${target}' -n -v -o SOURCE 2>/dev/null | tail -n 1 || true)
fsroot=\$(findmnt -M '${target}' -n -o FSROOT 2>/dev/null | tail -n 1 || true)
if [[ -z "\${src}" || "\${src}" != "\${scratch_src}" || "\${fsroot}" != '${target}' ]]; then
echo "astraios guard: ${target} is not bound to ${USB_SCRATCH_MOUNTPOINT}${target}" >&2
exit 1
fi
EOF
  done
  chmod 0755 "${guard_file}"

  requires_mounts="${USB_SCRATCH_MOUNTPOINT}"
  for target in "${TARGET_PATHS[@]}"; do
    requires_mounts="${requires_mounts} ${target}"
  done

  # systemd runs on the host, so strip the HOST_ROOT prefix from the path.
  verify_cmd="${guard_file#"${HOST_ROOT}"}"
  cat > "${tmp_dropin}" <<EOF
[Unit]
RequiresMountsFor=${requires_mounts}
After=local-fs.target
[Service]
ExecStartPre=${verify_cmd}
EOF

  if [ ! -f "${dropin_file}" ] || ! cmp -s "${dropin_file}" "${tmp_dropin}"; then
    cp "${tmp_dropin}" "${dropin_file}"
    log "updated k3s-agent Astraios guard drop-in"
    return 0
  fi
  return 1
}
# Succeed when the filesystem holding the host's /tmp is tmpfs.
tmp_is_tmpfs() {
  local tmp_fstype
  tmp_fstype="$(host_sh 'findmnt -T /tmp -n -o FSTYPE 2>/dev/null || true')"
  if [ "${tmp_fstype}" = "tmpfs" ]; then
    return 0
  fi
  return 1
}
# Make the host's /tmp a tmpfs right now: create it with mode 1777 and, if
# it is not already tmpfs, mount it via fstab, falling back to an explicit
# tmpfs mount.
ensure_tmp_tmpfs_live() {
  local script
  script='mkdir -p /tmp; chmod 1777 /tmp; '
  script+='fstype=$(findmnt -T /tmp -n -o FSTYPE 2>/dev/null || true); '
  script+='if [ "${fstype}" != "tmpfs" ]; then '
  script+='mount /tmp 2>/dev/null || mount -t tmpfs -o defaults,nosuid,nodev,mode=1777 tmpfs /tmp; fi'
  host_sh "${script}"
}
# Print the source of the filesystem mounted exactly at $1 on the host, or
# nothing when $1 is not a mountpoint.
#
# Uses --mountpoint rather than -T/--target: -T walks up to the containing
# mount when the path itself is not mounted, which made an unmounted scratch
# directory report the root device and look like a conflicting mount.
# For bind mounts findmnt prints the source as "device[/fsroot]".
find_existing_mount_source() {
  local target="$1"
  host_sh "findmnt --mountpoint '${target}' -n -o SOURCE 2>/dev/null || true"
}
# Print the first partition lsblk lists with RM=1 and a size of at least
# USB_SCRATCH_AUTO_MIN_SIZE_GIB GiB; prints nothing if none qualifies.
auto_discover_removable_partition() {
  local min_bytes awk_filter
  min_bytes=$(( USB_SCRATCH_AUTO_MIN_SIZE_GIB * 1024 * 1024 * 1024 ))
  awk_filter='$2=="part" && $4=="1" && $3>='"${min_bytes}"' {print $1; exit}'
  host_sh "lsblk -brnpo NAME,TYPE,SIZE,RM | awk '${awk_filter}'"
}
# DESTRUCTIVE: unmount the device ($1) if it is mounted, wipe its
# signatures, and create a fresh ext4 filesystem labelled $2.
format_device_ext4() {
  local device="$1"
  local label="$2"
  local script
  script="mountpoint=\$(findmnt -S '${device}' -n -o TARGET 2>/dev/null || true); "
  script+="if [ -n \"\${mountpoint}\" ]; then umount \"\${mountpoint}\"; fi; "
  script+="wipefs -a '${device}'; "
  script+="mkfs.ext4 -F -L '${label}' '${device}'"
  host_sh "${script}"
}
# Decide which device backs the scratch mount on this node and publish the
# result through globals (always returns 0):
#   SELECTOR_KIND   disabled | missing | invalid | fs-mismatch | uuid | label | device | auto
#   SELECTOR_VALUE  configured value (or the selector spec on fs-mismatch)
#   SELECTOR_SPEC   string to use as the fstab device field ("" when unusable)
#   DEVICE_PATH     resolved block device, "" when not present
#   DEVICE_FSTYPE   expected fstype, or the actual one on fs-mismatch
#   SELECTOR_MATCH_KIND, DEVICE_UUID, DEVICE_LABEL  set only on some paths
# Source priority: inventory row for NODE_NAME, then the default label, then
# auto-discovery of a removable partition.
resolve_selector() {
  local inventory_line enabled kind value fstype actual_device actual_fstype actual_uuid actual_label selector expected_fstype
  inventory_line="$(lookup_inventory)"
  enabled="${USB_SCRATCH_DEFAULT_ENABLED}"
  kind=""
  value=""
  fstype="${USB_SCRATCH_DEFAULT_FSTYPE}"
  if [ -n "${inventory_line}" ]; then
    # Row layout: node enabled match_kind match_value fstype; extras ignored.
    read -r _ enabled kind value fstype _ <<<"${inventory_line}"
  elif [ -n "${USB_SCRATCH_DEFAULT_LABEL}" ]; then
    kind="label"
    value="${USB_SCRATCH_DEFAULT_LABEL}"
  elif [ "${USB_SCRATCH_AUTO_SELECT_REMOVABLE}" = "true" ]; then
    kind="auto"
    value="removable-${USB_SCRATCH_AUTO_MIN_SIZE_GIB}Gi-plus"
  fi
  if [ "${enabled}" != "true" ]; then
    SELECTOR_KIND="disabled"
    SELECTOR_VALUE=""
    SELECTOR_SPEC=""
    DEVICE_PATH=""
    DEVICE_FSTYPE="${fstype}"
    return 0
  fi
  if [ -z "${kind}" ] || [ -z "${value}" ]; then
    SELECTOR_KIND="missing"
    SELECTOR_VALUE=""
    SELECTOR_SPEC=""
    DEVICE_PATH=""
    DEVICE_FSTYPE="${fstype}"
    return 0
  fi
  # Translate the match kind into an fstab selector and look up the device.
  case "${kind}" in
    uuid)
      selector="UUID=${value}"
      actual_device="$(host_sh "blkid -U '${value}' 2>/dev/null || true")"
      ;;
    label)
      selector="LABEL=${value}"
      actual_device="$(host_sh "blkid -L '${value}' 2>/dev/null || true")"
      ;;
    device)
      selector="${value}"
      actual_device="$(host_sh "if [ -b '${value}' ]; then printf '%s' '${value}'; fi")"
      ;;
    auto)
      actual_device="$(auto_discover_removable_partition)"
      selector="${actual_device}"
      ;;
    *)
      SELECTOR_KIND="invalid"
      SELECTOR_VALUE="${value}"
      SELECTOR_SPEC=""
      DEVICE_PATH=""
      DEVICE_FSTYPE="${fstype}"
      return 0
      ;;
  esac
  actual_fstype=""
  actual_uuid=""
  actual_label=""
  # Auto-discovery with no candidate has no usable selector at all.
  if [ "${kind}" = "auto" ] && [ -z "${actual_device}" ]; then
    SELECTOR_KIND="missing"
    SELECTOR_VALUE="${value}"
    SELECTOR_SPEC=""
    DEVICE_PATH=""
    DEVICE_FSTYPE="${fstype}"
    SELECTOR_MATCH_KIND="${kind}"
    return 0
  fi
  if [ -n "${actual_device}" ]; then
    actual_fstype="$(host_sh "blkid -o value -s TYPE '${actual_device}' 2>/dev/null || true")"
    actual_uuid="$(host_sh "blkid -o value -s UUID '${actual_device}' 2>/dev/null || true")"
    actual_label="$(host_sh "blkid -o value -s LABEL '${actual_device}' 2>/dev/null || true")"
  fi
  # Prefer a UUID selector over the raw device path for auto-discovered devices.
  if [ "${kind}" = "auto" ] && [ -n "${actual_uuid}" ]; then
    selector="UUID=${actual_uuid}"
  fi
  # An inventory row with only four fields leaves fstype empty; use the default.
  expected_fstype="${fstype:-${USB_SCRATCH_DEFAULT_FSTYPE}}"
  SELECTOR_MATCH_KIND="${kind}"
  if [ -n "${actual_fstype}" ] && [ -n "${expected_fstype}" ] && [ "${actual_fstype}" != "${expected_fstype}" ]; then
    SELECTOR_KIND="fs-mismatch"
    SELECTOR_VALUE="${selector}"
    SELECTOR_SPEC="${selector}"
    DEVICE_PATH="${actual_device}"
    DEVICE_FSTYPE="${actual_fstype}"
    return 0
  fi
  SELECTOR_KIND="${kind}"
  SELECTOR_VALUE="${value}"
  SELECTOR_SPEC="${selector}"
  DEVICE_PATH="${actual_device}"
  DEVICE_FSTYPE="${expected_fstype}"
  DEVICE_UUID="${actual_uuid}"
  DEVICE_LABEL="${actual_label}"
}
# Create (through the HOST_ROOT view of the host filesystem) the state
# directory, the scratch mountpoint, and for every TARGET_PATHS entry both
# the target directory and its counterpart under the scratch mountpoint.
ensure_directories() {
  local target
  mkdir -p "${STATE_DIR}" "${HOST_ROOT}${USB_SCRATCH_MOUNTPOINT}"
  for target in "${TARGET_PATHS[@]}"; do
    mkdir -p "${HOST_ROOT}${target}" "${HOST_ROOT}${USB_SCRATCH_MOUNTPOINT}${target}"
  done
}
# Mount the scratch volume at USB_SCRATCH_MOUNTPOINT if it is not mounted.
# Returns 1 (after logging) when something other than DEVICE_PATH or
# SELECTOR_SPEC is reported as the current source and a device is known;
# otherwise issues the mount and returns 0.
ensure_usb_mount_live() {
  local mounted_from
  mounted_from="$(find_existing_mount_source "${USB_SCRATCH_MOUNTPOINT}")"

  if [ -n "${mounted_from}" ] && [ -n "${DEVICE_PATH}" ]; then
    if [ "${mounted_from}" != "${DEVICE_PATH}" ] && [ "${mounted_from}" != "${SELECTOR_SPEC}" ]; then
      log "usb scratch already mounted from unexpected source ${mounted_from}"
      return 1
    fi
  fi

  host_sh "mkdir -p '${USB_SCRATCH_MOUNTPOINT}'; mountpoint -q '${USB_SCRATCH_MOUNTPOINT}' || mount '${USB_SCRATCH_MOUNTPOINT}'"
  return 0
}
# Print the available space on the scratch mount in whole GiB, derived from
# the 1K-block "available" column of `df -Pk`.
free_space_gib() {
  local awk_prog='NR==2 { printf "%.0f", $4 / 1024 / 1024 }'
  host_sh "df -Pk '${USB_SCRATCH_MOUNTPOINT}' | awk '${awk_prog}'"
}
# Succeed when $1 is a mountpoint bind-mounted from the scratch volume.
#
# A bind mount shows up in findmnt's SOURCE column as "device[/fsroot]", not
# as the bind source directory, so comparing SOURCE with
# "${USB_SCRATCH_MOUNTPOINT}<target>" never matched and triggered a cutover
# on every reconcile. Instead, require that the target's backing device
# (SOURCE with -v/--nofsroot) equals the scratch mount's device and that its
# filesystem root equals the target path.
# NOTE(review): assumes the scratch filesystem is mounted from its own root
# (FSROOT "/"), so <mountpoint>/<target> has FSROOT "<target>".
target_bound_to_scratch() {
  local target="$1"
  local scratch_source target_source target_fsroot
  scratch_source="$(host_sh "findmnt -M '${USB_SCRATCH_MOUNTPOINT}' -n -v -o SOURCE 2>/dev/null | tail -n 1 || true")"
  target_source="$(host_sh "findmnt -M '${target}' -n -v -o SOURCE 2>/dev/null | tail -n 1 || true")"
  target_fsroot="$(host_sh "findmnt -M '${target}' -n -o FSROOT 2>/dev/null | tail -n 1 || true")"

  # An unmounted scratch volume can never back a valid bind.
  [ -n "${scratch_source}" ] \
    && [ "${target_source}" = "${scratch_source}" ] \
    && [ "${target_fsroot}" = "${target}" ]
}
# Copy the current contents of the host path $1 into its directory on the
# scratch volume, using rsync when the host has it and a tar pipe otherwise.
seed_target_data() {
  local target="$1"
  local scratch_dir="${USB_SCRATCH_MOUNTPOINT}${target}"
  local script
  script="mkdir -p '${scratch_dir}' '${target}'; "
  script+="if command -v rsync >/dev/null 2>&1; then "
  script+="rsync -aHAX --numeric-ids '${target}/' '${scratch_dir}/'; "
  script+="else tar -C '${target}' -cf - . | tar -C '${scratch_dir}' -xf -; fi"
  host_sh "${script}"
}
# Mount the host path $1 (resolved through fstab) unless it already is a
# mountpoint.
mount_target_live() {
  local bind_target="$1"
  local script="mountpoint -q '${bind_target}' || mount '${bind_target}'"
  host_sh "${script}"
}
# Succeed (0) when at least one TARGET_PATHS entry is not yet bound to the
# scratch volume; return 1 when every entry is bound.
cutover_needed() {
  local path
  for path in "${TARGET_PATHS[@]}"; do
    target_bound_to_scratch "${path}" || return 0
  done
  return 1
}
# Move every TARGET_PATHS entry that is not yet bound onto the scratch
# volume: optional random jitter, stop k3s-agent, seed + bind-mount each
# target, then start k3s-agent again.
#
# Every step's status is checked explicitly. This function runs underneath
# `reconcile_once || true`, where bash ignores `set -e`, so relying on
# errexit would let a failed seed be bind-mounted over the live path.
# A target whose seeding fails is left on its original filesystem.
#
# Returns 0 when nothing was needed or every step succeeded, 1 otherwise.
# agent_stopped stays 1 if the agent could not be restarted, so the EXIT
# trap retries the start.
perform_cutover() {
  local jitter=0 target rc=0

  if ! cutover_needed; then
    return 0
  fi

  # Spread agent restarts across nodes.
  if [ "${USB_SCRATCH_CUTOVER_JITTER_MAX_SEC}" -gt 0 ]; then
    jitter=$(( RANDOM % (USB_SCRATCH_CUTOVER_JITTER_MAX_SEC + 1) ))
  fi
  if [ "${jitter}" -gt 0 ]; then
    log "sleeping ${jitter}s before first live cutover"
    sleep "${jitter}"
  fi

  log "stopping k3s-agent for Astraios cutover"
  if ! host_sh "systemctl stop k3s-agent"; then
    # Copying live data while the agent still runs is unsafe; bail out.
    log "failed to stop k3s-agent; skipping Astraios cutover"
    return 1
  fi
  agent_stopped=1

  for target in "${TARGET_PATHS[@]}"; do
    if target_bound_to_scratch "${target}"; then
      continue
    fi
    log "seeding ${target} into ${USB_SCRATCH_MOUNTPOINT}${target}"
    if ! seed_target_data "${target}"; then
      log "seeding ${target} failed; leaving it on the original filesystem"
      rc=1
      continue
    fi
    log "mounting bind target ${target}"
    if ! mount_target_live "${target}"; then
      log "bind mount of ${target} failed"
      rc=1
    fi
  done

  log "starting k3s-agent after Astraios cutover"
  if host_sh "systemctl start k3s-agent"; then
    agent_stopped=0
  else
    log "failed to start k3s-agent after Astraios cutover"
    rc=1
  fi
  return "${rc}"
}
# One full reconcile pass for this node. Every outcome is reported through
# annotate_node and the function returns 0 on all handled paths.
#
# Steps: load config, resolve the device selector, ensure directories,
# handle unusable selectors (optionally auto-formatting an auto-discovered
# device with the wrong filesystem), write the managed fstab block, mount
# the scratch volume, check free space, install the k3s-agent guard, perform
# the bind-mount cutover, and enforce tmpfs on /tmp when configured.
reconcile_once() {
  local fstab_changed=false guard_changed=false free_gib selector_detail tmp_detail

  load_config
  resolve_selector
  ensure_directories

  case "${SELECTOR_KIND}" in
    disabled)
      annotate_node "disabled" "inventory-disabled" "none"
      log "inventory disables Astraios on ${NODE_NAME}"
      return 0
      ;;
    missing)
      annotate_node "pending" "missing-inventory" "none"
      log "no inventory entry or default selector for ${NODE_NAME}"
      return 0
      ;;
    invalid)
      annotate_node "error" "invalid-selector" "${SELECTOR_VALUE}"
      log "invalid selector configured for ${NODE_NAME}"
      return 0
      ;;
    fs-mismatch)
      # Only auto-discovered devices are ever reformatted.
      if [ "${USB_SCRATCH_AUTO_FORMAT_REMOVABLE}" = "true" ] && [ "${SELECTOR_MATCH_KIND:-}" = "auto" ] && [ -n "${DEVICE_PATH}" ]; then
        log "formatting auto-discovered device ${DEVICE_PATH} as ext4 label=${USB_SCRATCH_AUTO_FORMAT_LABEL}"
        if format_device_ext4 "${DEVICE_PATH}" "${USB_SCRATCH_AUTO_FORMAT_LABEL}"; then
          resolve_selector
        fi
      fi
      if [ "${SELECTOR_KIND}" = "fs-mismatch" ]; then
        annotate_node "error" "filesystem-mismatch" "${SELECTOR_SPEC}"
        log "filesystem mismatch on ${DEVICE_PATH}: expected ${USB_SCRATCH_DEFAULT_FSTYPE}, got ${DEVICE_FSTYPE}"
        return 0
      fi
      ;;
  esac

  # Re-resolving after a format can leave no usable selector (for example the
  # device vanished). Never write an fstab line with an empty device field.
  if [ -z "${SELECTOR_SPEC}" ]; then
    annotate_node "pending" "device-not-found" "none"
    log "Astraios device not present for ${NODE_NAME} after selector re-resolution"
    return 0
  fi

  selector_detail="${SELECTOR_SPEC}"
  if ensure_fstab_block "${SELECTOR_SPEC}" "${DEVICE_FSTYPE}"; then
    fstab_changed=true
    host_sh "systemctl daemon-reload || true"
  fi

  # fstab is written even without a device so the mount appears once it does.
  if [ -z "${DEVICE_PATH}" ]; then
    annotate_node "pending" "device-not-found" "${selector_detail}"
    log "Astraios device not present yet for selector ${selector_detail}"
    return 0
  fi

  if ! ensure_usb_mount_live; then
    annotate_node "error" "mount-conflict" "${selector_detail}"
    return 0
  fi

  free_gib="$(free_space_gib || true)"
  if [ -z "${free_gib}" ]; then
    annotate_node "error" "free-space-check-failed" "${selector_detail}"
    return 0
  fi
  if [ "${free_gib}" -lt "${USB_SCRATCH_REQUIRED_FREE_GIB}" ]; then
    annotate_node "error" "insufficient-free-space-${free_gib}Gi" "${selector_detail}"
    log "Astraios free space ${free_gib}Gi below required ${USB_SCRATCH_REQUIRED_FREE_GIB}Gi"
    return 0
  fi

  if ensure_k3s_agent_guard; then
    guard_changed=true
    host_sh "systemctl daemon-reload || true"
  fi

  if host_sh "systemctl list-unit-files | grep -q '^k3s-agent.service'"; then
    # The cutover_needed check below is the source of truth for success.
    perform_cutover || true
  else
    annotate_node "error" "missing-k3s-agent-service" "${selector_detail}"
    log "k3s-agent.service missing on ${NODE_NAME}"
    return 0
  fi

  if cutover_needed; then
    annotate_node "error" "bind-mount-incomplete" "${selector_detail}"
    return 0
  fi

  tmp_detail="tmpfs-ok"
  if [ "${USB_SCRATCH_ENFORCE_TMPFS_TMP}" = "true" ]; then
    if ! ensure_tmp_tmpfs_live || ! tmp_is_tmpfs; then
      annotate_node "error" "tmpfs-tmp-enforce-failed" "${selector_detail}"
      log "failed to enforce /tmp tmpfs on ${NODE_NAME}"
      return 0
    fi
  elif ! tmp_is_tmpfs; then
    tmp_detail="tmp-not-tmpfs"
    log "warning: /tmp is not tmpfs on ${NODE_NAME}; SD wear reduction is lower than expected"
  fi

  if [ "${fstab_changed}" = true ]; then
    log "Astraios fstab refreshed for ${NODE_NAME}"
  fi
  if [ "${guard_changed}" = true ]; then
    log "k3s-agent Astraios guard refreshed for ${NODE_NAME}"
  fi

  annotate_node "ready" "astraios-online-${free_gib}Gi-${tmp_detail}" "${selector_detail}"
  log "Astraios ready on ${NODE_NAME} via ${selector_detail} mounted at ${USB_SCRATCH_MOUNTPOINT}"
}
# Entry point: reconcile forever, sleeping
# USB_SCRATCH_RECONCILE_INTERVAL_SEC between passes. A failed pass never
# ends the loop. With ONE_SHOT=true the script exits 0 after the first pass.
main() {
  while :; do
    reconcile_once || true
    [ "${ONE_SHOT}" != "true" ] || exit 0
    sleep "${USB_SCRATCH_RECONCILE_INTERVAL_SEC}"
  done
}

main

View File

@ -584,44 +584,6 @@
}
},
"timeFrom": "30d"
},
{
"id": 9,
"type": "timeseries",
"title": "Astraios Usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 44
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "30d"
}
],
"time": {

View File

@ -1957,7 +1957,7 @@
{
"id": 47,
"type": "bargauge",
"title": "PVC Backup Health / Age",
"title": "Platform Suite Pass Rate (24h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1970,35 +1970,31 @@
},
"targets": [
{
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
"expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pvc}}",
"legendFormat": "{{suite}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "h",
"unit": "percent",
"min": 0,
"max": null,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"color": "red",
"value": null
},
{
"color": "yellow",
"value": 6
"value": 80
},
{
"color": "orange",
"value": 12
},
{
"color": "red",
"value": 24
"color": "green",
"value": 95
}
]
}
@ -2029,12 +2025,12 @@
],
"links": [
{
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"title": "Open atlas-jobs dashboard",
"url": "/d/atlas-jobs",
"targetBlank": true
}
],
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
"description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
},
{
"id": 30,
@ -3176,7 +3172,7 @@
{
"id": 22,
"type": "bargauge",
"title": "Nodes Closest to Full Astraios Disks",
"title": "Nodes Closest to Full Root Disks",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -3189,7 +3185,7 @@
},
"targets": [
{
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"refId": "A",
"legendFormat": "{{node}}"
}

View File

@ -593,44 +593,6 @@ data:
}
},
"timeFrom": "30d"
},
{
"id": 9,
"type": "timeseries",
"title": "Astraios Usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 44
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "30d"
}
],
"time": {

View File

@ -1966,7 +1966,7 @@ data:
{
"id": 47,
"type": "bargauge",
"title": "PVC Backup Health / Age",
"title": "Platform Suite Pass Rate (24h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1979,35 +1979,31 @@ data:
},
"targets": [
{
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
"expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pvc}}",
"legendFormat": "{{suite}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "h",
"unit": "percent",
"min": 0,
"max": null,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"color": "red",
"value": null
},
{
"color": "yellow",
"value": 6
"value": 80
},
{
"color": "orange",
"value": 12
},
{
"color": "red",
"value": 24
"color": "green",
"value": 95
}
]
}
@ -2038,12 +2034,12 @@ data:
],
"links": [
{
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"title": "Open atlas-jobs dashboard",
"url": "/d/atlas-jobs",
"targetBlank": true
}
],
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
"description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
},
{
"id": 30,
@ -3185,7 +3181,7 @@ data:
{
"id": 22,
"type": "bargauge",
"title": "Nodes Closest to Full Astraios Disks",
"title": "Nodes Closest to Full Root Disks",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -3198,7 +3194,7 @@ data:
},
"targets": [
{
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"refId": "A",
"legendFormat": "{{node}}"
}