Compare commits

...

2 Commits

11 changed files with 856 additions and 39 deletions

View File

@ -35,6 +35,7 @@ data:
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
PUBLIC_FOLDER = "overview"
PRIVATE_FOLDER = "atlas-internal"
ASTRAIOS_MOUNTPOINT = "/mnt/astraios"
PERCENT_THRESHOLDS = {
"mode": "absolute",
@ -156,6 +157,10 @@ def root_usage_expr(scope=""):
return filesystem_usage_expr("/", scope)
def astraios_usage_expr(scope=""):
    """Return the filesystem usage expression for the Astraios mountpoint.

    Thin wrapper that calls ``filesystem_usage_expr`` with
    ``ASTRAIOS_MOUNTPOINT`` and forwards ``scope`` unchanged.
    """
    return filesystem_usage_expr(ASTRAIOS_MOUNTPOINT, scope)
def astreae_usage_expr(mount):
return (
f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / "
@ -470,6 +475,7 @@ PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
)
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))"
ANANKE_SELECTOR = 'job="ananke-power"'
ANANKE_UPS_DB_NAME = "Pyrphoros"
ANANKE_UPS_DB_NODE = "titan-db"
@ -1540,26 +1546,27 @@ def build_overview():
panels.append(
bargauge_panel(
47,
"Platform Suite Pass Rate (24h)",
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE,
"PVC Backup Health / Age",
PVC_BACKUP_AGE_HOURS_BY_PVC,
{"h": 5, "w": 6, "x": 18, "y": 7},
unit="percent",
unit="h",
instant=True,
legend="{{suite}}",
legend="{{namespace}}/{{pvc}}",
sort_order="desc",
thresholds={
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "yellow", "value": 80},
{"color": "green", "value": 95},
{"color": "green", "value": None},
{"color": "yellow", "value": 6},
{"color": "orange", "value": 12},
{"color": "red", "value": 24},
],
},
)
)
panels[-1]["links"] = link_to("atlas-jobs")
panels[-1]["links"] = link_to("atlas-storage")
panels[-1]["description"] = (
"24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
"Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
)
panels.append(
@ -1847,8 +1854,8 @@ def build_overview():
panels.append(
bargauge_panel(
22,
"Nodes Closest to Full Root Disks",
f"topk(12, {root_usage_expr()})",
"Nodes Closest to Full Astraios Disks",
f"topk(12, {astraios_usage_expr()})",
{"h": 16, "w": 12, "x": 12, "y": 71},
unit="percent",
thresholds=PERCENT_THRESHOLDS,
@ -2221,6 +2228,19 @@ def build_nodes_dashboard():
time_from="30d",
)
)
panels.append(
timeseries_panel(
9,
"Astraios Usage",
astraios_usage_expr(),
{"h": 9, "w": 24, "x": 0, "y": 44},
unit="percent",
legend="{{node}}",
legend_display="table",
legend_placement="right",
time_from="30d",
)
)
return {
"uid": "atlas-nodes",
"title": "Atlas Nodes",

View File

@ -3,6 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- pi-usb-scratch-configmap.yaml
- image.yaml
- secretproviderclass.yaml
- metis-configmap.yaml
@ -18,7 +19,9 @@ resources:
- metis-rbac.yaml
- metis-token-sync-serviceaccount.yaml
- node-nofile-serviceaccount.yaml
- pi-usb-scratch-serviceaccount.yaml
- pod-cleaner-rbac.yaml
- pi-usb-scratch-rbac.yaml
- ariadne-deployment.yaml
- metis-deployment.yaml
- oneoffs/ariadne-migrate-job.yaml
@ -26,6 +29,7 @@ resources:
- disable-k3s-traefik-daemonset.yaml
- oneoffs/k3s-traefik-cleanup-job.yaml
- node-nofile-daemonset.yaml
- pi-usb-scratch-daemonset.yaml
- metis-sentinel-amd64-daemonset.yaml
- metis-sentinel-arm64-daemonset.yaml
- metis-k3s-token-sync-cronjob.yaml
@ -74,3 +78,9 @@ configMapGenerator:
- node_image_sweeper.sh=scripts/node_image_sweeper.sh
options:
disableNameSuffixHash: true
- name: pi-usb-scratch-script
namespace: maintenance
files:
- pi_usb_scratch.sh=scripts/pi_usb_scratch.sh
options:
disableNameSuffixHash: true

View File

@ -0,0 +1,48 @@
# services/maintenance/pi-usb-scratch-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: pi-usb-scratch-config
namespace: maintenance
data:
usb_scratch.env: |
USB_SCRATCH_DEFAULT_ENABLED=true
# Leave empty to avoid label-based fallback selection.
USB_SCRATCH_DEFAULT_LABEL=
USB_SCRATCH_DEFAULT_FSTYPE=ext4
USB_SCRATCH_MOUNTPOINT=/mnt/astraios
# Auto-select the first removable partition of at least USB_SCRATCH_AUTO_MIN_SIZE_GIB on each worker (sized for the 64GB USB sticks).
USB_SCRATCH_AUTO_SELECT_REMOVABLE=true
USB_SCRATCH_AUTO_MIN_SIZE_GIB=50
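# One-time bootstrap for new sticks that ship exfat/fat32. WARNING: destructive - an auto-selected partition whose filesystem differs from the expected fstype is wiped and reformatted as ext4.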
USB_SCRATCH_AUTO_FORMAT_REMOVABLE=true
USB_SCRATCH_AUTO_FORMAT_LABEL=astraios
# Keep /tmp in RAM to reduce SD-card writes.
USB_SCRATCH_ENFORCE_TMPFS_TMP=true
USB_SCRATCH_REQUIRED_FREE_GIB=20
USB_SCRATCH_RECONCILE_INTERVAL_SEC=900
USB_SCRATCH_CUTOVER_JITTER_MAX_SEC=900
usb_scratch_inventory.tsv: |
# node_name enabled match_kind match_value fstype
# match_kind: uuid | label | device
# Astraios policy:
# - use UUID entries per worker node (preferred)
# - avoid shared labels to prevent accidental wrong-device mounts
# - mountpoint is /mnt/astraios on every worker node
# Example:
# titan-04 true uuid 11111111-2222-3333-4444-555555555555 ext4
# titan-05 true uuid <uuid-for-titan-05-astraios> ext4
# titan-06 true uuid <uuid-for-titan-06-astraios> ext4
# titan-07 true uuid <uuid-for-titan-07-astraios> ext4
# titan-08 true uuid <uuid-for-titan-08-astraios> ext4
# titan-09 true uuid <uuid-for-titan-09-astraios> ext4
# titan-10 true uuid <uuid-for-titan-10-astraios> ext4
# titan-11 true uuid <uuid-for-titan-11-astraios> ext4
# titan-12 true uuid <uuid-for-titan-12-astraios> ext4
# titan-13 true uuid <uuid-for-titan-13-astraios> ext4
# titan-14 true uuid <uuid-for-titan-14-astraios> ext4
# titan-15 true uuid <uuid-for-titan-15-astraios> ext4
# titan-16 true uuid <uuid-for-titan-16-astraios> ext4
# titan-17 true uuid <uuid-for-titan-17-astraios> ext4
# titan-18 true uuid <uuid-for-titan-18-astraios> ext4
# titan-19 true uuid <uuid-for-titan-19-astraios> ext4

View File

@ -0,0 +1,68 @@
# services/maintenance/pi-usb-scratch-daemonset.yaml
# DaemonSet that runs /scripts/pi_usb_scratch.sh on each selected node.
# The pod is privileged, shares the host PID namespace and mounts the host
# root filesystem at /host.
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: pi-usb-scratch
namespace: maintenance
spec:
selector:
matchLabels:
app: pi-usb-scratch
updateStrategy:
type: RollingUpdate
rollingUpdate:
# Roll out one node at a time.
maxUnavailable: 1
template:
metadata:
labels:
app: pi-usb-scratch
spec:
serviceAccountName: pi-usb-scratch
# Host PID namespace is shared; the script uses nsenter -t 1 to run host commands.
hostPID: true
# Only arm64 nodes labelled as workers are eligible.
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
# Further restrict scheduling to nodes whose "hardware" label is rpi4 or rpi5.
- key: hardware
operator: In
values:
- rpi4
- rpi5
containers:
- name: pi-usb-scratch
# Image is pinned by digest rather than by tag.
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/usr/bin/env", "bash"]
args: ["/scripts/pi_usb_scratch.sh"]
env:
# The script requires NODE_NAME; it is taken from the pod's scheduled node.
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
securityContext:
privileged: true
runAsUser: 0
volumeMounts:
# Host root filesystem; the script defaults HOST_ROOT to /host.
- name: host-root
mountPath: /host
- name: script
mountPath: /scripts
readOnly: true
# Provides /config/usb_scratch.env and /config/usb_scratch_inventory.tsv.
- name: config
mountPath: /config
readOnly: true
volumes:
- name: host-root
hostPath:
path: /
- name: script
configMap:
name: pi-usb-scratch-script
defaultMode: 0555
- name: config
configMap:
name: pi-usb-scratch-config
defaultMode: 0444

View File

@ -0,0 +1,26 @@
# services/maintenance/pi-usb-scratch-rbac.yaml
# Cluster-wide permissions for the pi-usb-scratch service account:
# get/list/patch on core "nodes" (the script annotates its own node via kubectl).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: pi-usb-scratch
rules:
- apiGroups: [""]
resources:
- nodes
verbs:
- get
- list
- patch
---
# Binds the ClusterRole above to the pi-usb-scratch ServiceAccount in the
# maintenance namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: pi-usb-scratch
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: pi-usb-scratch
subjects:
- kind: ServiceAccount
name: pi-usb-scratch
namespace: maintenance

View File

@ -0,0 +1,6 @@
# services/maintenance/pi-usb-scratch-serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: pi-usb-scratch
namespace: maintenance

View File

@ -0,0 +1,555 @@
#!/usr/bin/env bash
set -euo pipefail

# --- Runtime inputs (overridable through the environment) -------------------
# NODE_NAME is mandatory; the script aborts with the message below if unset.
NODE_NAME=${NODE_NAME:?NODE_NAME is required}
: "${HOST_ROOT:=/host}"
: "${CONFIG_ENV:=/config/usb_scratch.env}"
: "${INVENTORY_FILE:=/config/usb_scratch_inventory.tsv}"
: "${ONE_SHOT:=false}"

# --- Derived host paths and fixed markers -----------------------------------
FSTAB_PATH="${HOST_ROOT}/etc/fstab"
STATE_DIR="${HOST_ROOT}/var/lib/maintenance/pi-usb-scratch"
# Delimiters of the fstab section owned by this script.
MANAGED_BEGIN="# BEGIN maintenance.bstein.dev usb-scratch"
MANAGED_END="# END maintenance.bstein.dev usb-scratch"
# PATH exported to every command executed on the host via host_sh.
DEFAULT_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"

# --- Tunables: applied only when the variable is unset or empty -------------
: "${USB_SCRATCH_DEFAULT_ENABLED:=true}"
: "${USB_SCRATCH_DEFAULT_LABEL:=}"
: "${USB_SCRATCH_DEFAULT_FSTYPE:=ext4}"
: "${USB_SCRATCH_MOUNTPOINT:=/mnt/astraios}"
: "${USB_SCRATCH_ENFORCE_TMPFS_TMP:=true}"
: "${USB_SCRATCH_AUTO_SELECT_REMOVABLE:=true}"
: "${USB_SCRATCH_AUTO_MIN_SIZE_GIB:=50}"
: "${USB_SCRATCH_AUTO_FORMAT_REMOVABLE:=true}"
: "${USB_SCRATCH_AUTO_FORMAT_LABEL:=astraios}"
: "${USB_SCRATCH_REQUIRED_FREE_GIB:=20}"
: "${USB_SCRATCH_RECONCILE_INTERVAL_SEC:=900}"
: "${USB_SCRATCH_CUTOVER_JITTER_MAX_SEC:=900}"

# Host directories that are bind-mounted from the scratch volume.
TARGET_PATHS=(
  "/var/log/pods"
  "/var/log/containers"
  "/var/lib/rancher/k3s/agent/containerd"
  "/var/lib/rancher/k3s/agent/kubelet"
  "/var/lib/rancher/k3s/agent/images"
  "/var/tmp"
)

# Set to 1 while this script has k3s-agent stopped; read by the EXIT trap.
agent_stopped=0
# Print the arguments as one log line prefixed with a UTC ISO-8601 timestamp.
log() {
  local stamp
  stamp="$(date -u +%FT%TZ)"
  printf '[%s] %s\n' "${stamp}" "$*"
}
# Normalise a string for use as an annotation value: spaces become
# underscores, then every character outside [:alnum:] . _ : / = - is dropped.
sanitize_annotation_value() {
  local underscored="${1// /_}"
  printf '%s' "${underscored}" | tr -cd '[:alnum:]._:/=-'
}
# Record reconcile state on the node object as annotations.
#
# Arguments:
#   $1  status   short state such as "ready", "pending" or "error"
#   $2  detail   free-form detail string
#   $3  selector selector spec that was used (or "none")
#
# The same six values are written under both the "astraios-" and the
# "usb-scratch-" annotation prefixes. Annotation is best effort: a kubectl
# failure never aborts the reconcile, but it is logged (with kubectl's
# stderr) so that RBAC or API problems are visible in the pod log.
# Always returns 0.
annotate_node() {
  local status detail selector mountpoint managed_paths timestamp prefix kubectl_err
  local -a annotations=()

  status="$(sanitize_annotation_value "$1")"
  detail="$(sanitize_annotation_value "$2")"
  selector="$(sanitize_annotation_value "$3")"
  mountpoint="$(sanitize_annotation_value "${USB_SCRATCH_MOUNTPOINT}")"
  managed_paths="$(sanitize_annotation_value "${TARGET_PATHS[*]}")"
  timestamp="$(date -u +%FT%TZ)"

  for prefix in astraios usb-scratch; do
    annotations+=(
      "maintenance.bstein.dev/${prefix}-status=${status}"
      "maintenance.bstein.dev/${prefix}-detail=${detail}"
      "maintenance.bstein.dev/${prefix}-selector=${selector}"
      "maintenance.bstein.dev/${prefix}-mountpoint=${mountpoint}"
      "maintenance.bstein.dev/${prefix}-managed-paths=${managed_paths}"
      "maintenance.bstein.dev/${prefix}-last-apply=${timestamp}"
    )
  done

  # Capture stderr only; stdout stays discarded as before.
  if ! kubectl_err="$(kubectl annotate --overwrite node "${NODE_NAME}" "${annotations[@]}" 2>&1 >/dev/null)"; then
    log "warning: failed to annotate node ${NODE_NAME} (status=${status}): ${kubectl_err//$'\n'/ }"
  fi
  return 0
}
# Run a shell snippet on the host with PATH set to DEFAULT_PATH.
#
# Argument $1 is passed to `/bin/sh -ceu`. The snippet runs inside the
# namespaces of PID 1 via nsenter, preferring an nsenter found on the
# container's PATH, then the host's /usr/bin/nsenter, then /bin/nsenter.
# When no nsenter is available it falls back to chroot into HOST_ROOT.
# Returns the exit status of the executed snippet.
host_sh() {
  local snippet="PATH=${DEFAULT_PATH}; $1"
  local nsenter_bin=""
  local candidate

  if command -v nsenter >/dev/null 2>&1; then
    nsenter_bin="nsenter"
  else
    for candidate in "${HOST_ROOT}/usr/bin/nsenter" "${HOST_ROOT}/bin/nsenter"; do
      if [ -x "${candidate}" ]; then
        nsenter_bin="${candidate}"
        break
      fi
    done
  fi

  if [ -z "${nsenter_bin}" ]; then
    chroot "${HOST_ROOT}" /bin/sh -ceu "${snippet}"
    return
  fi
  "${nsenter_bin}" -t 1 -m -u -i -n -p -- /bin/sh -ceu "${snippet}"
}
# EXIT handler: if the script exits while it still has k3s-agent stopped,
# try to start the agent again (start failures are ignored).
cleanup() {
  [ "${agent_stopped}" -eq 1 ] || return 0
  log "starting k3s-agent after interrupted cutover"
  host_sh "systemctl start k3s-agent || true"
  agent_stopped=0
}
trap cleanup EXIT
# Source CONFIG_ENV when it exists, then re-apply the built-in default to
# every tunable that is still unset or empty afterwards.
load_config() {
  # shellcheck disable=SC1090
  [ ! -f "${CONFIG_ENV}" ] || . "${CONFIG_ENV}"

  : "${USB_SCRATCH_DEFAULT_ENABLED:=true}"
  : "${USB_SCRATCH_DEFAULT_LABEL:=}"
  : "${USB_SCRATCH_DEFAULT_FSTYPE:=ext4}"
  : "${USB_SCRATCH_MOUNTPOINT:=/mnt/astraios}"
  : "${USB_SCRATCH_ENFORCE_TMPFS_TMP:=true}"
  : "${USB_SCRATCH_AUTO_SELECT_REMOVABLE:=true}"
  : "${USB_SCRATCH_AUTO_MIN_SIZE_GIB:=50}"
  : "${USB_SCRATCH_AUTO_FORMAT_REMOVABLE:=true}"
  : "${USB_SCRATCH_AUTO_FORMAT_LABEL:=astraios}"
  : "${USB_SCRATCH_REQUIRED_FREE_GIB:=20}"
  : "${USB_SCRATCH_RECONCILE_INTERVAL_SEC:=900}"
  : "${USB_SCRATCH_CUTOVER_JITTER_MAX_SEC:=900}"
}
# Print the first non-comment inventory row (at least four fields) whose
# first field equals NODE_NAME. Prints nothing when the inventory file is
# missing or holds no matching row.
lookup_inventory() {
  local match=""
  [ -f "${INVENTORY_FILE}" ] || { printf '%s' "${match}"; return 0; }
  match="$(
    awk -v node="${NODE_NAME}" '
      NF >= 4 && $1 !~ /^#/ && $1 == node { print; exit }
    ' "${INVENTORY_FILE}" || true
  )"
  printf '%s' "${match}"
}
# Print the file given as $1 with the managed section removed: the
# MANAGED_BEGIN line, the MANAGED_END line and everything between them.
strip_managed_block() {
  local input_path="$1"
  awk -v begin="${MANAGED_BEGIN}" -v end="${MANAGED_END}" '
    $0 == begin { inside = 1; next }
    $0 == end   { inside = 0; next }
    !inside
  ' "${input_path}"
}
# Make sure the host fstab ends with the managed Astraios block.
#
# Arguments:
#   $1  selector  fstab source spec for the scratch volume (e.g. UUID=...)
#   $2  fstype    filesystem type for the scratch volume
#
# The managed block holds the scratch mount, an optional tmpfs /tmp entry and
# one bind mount per TARGET_PATHS entry. When tmpfs enforcement is on, any
# pre-existing non-comment /tmp entry outside the block is dropped.
#
# Returns 0 when fstab was rewritten, 1 when it was already up to date or
# when the update could not be prepared safely (fstab is then left as is).
#
# This function runs with errexit suppressed (it is used as an `if`
# condition), so every step that could produce a truncated candidate is
# checked explicitly. The new file is installed with a rename so a crash
# cannot leave a half-written fstab.
ensure_fstab_block() {
  local selector="$1"
  local fstype="$2"
  local tmp_base tmp_candidate tmp_install bind_source target

  mkdir -p "${STATE_DIR}"
  tmp_base="${STATE_DIR}/fstab.base"
  tmp_candidate="${STATE_DIR}/fstab.candidate"
  # Staged next to fstab so the final mv is a same-filesystem rename.
  tmp_install="${FSTAB_PATH}.astraios-new"

  if [ ! -r "${FSTAB_PATH}" ] || ! strip_managed_block "${FSTAB_PATH}" > "${tmp_base}"; then
    log "cannot read ${FSTAB_PATH}; leaving it untouched"
    return 1
  fi

  if [ "${USB_SCRATCH_ENFORCE_TMPFS_TMP}" = "true" ]; then
    if ! awk '$1 ~ /^#/ || $2 != "/tmp" { print }' "${tmp_base}" > "${tmp_base}.tmpfs" \
      || ! mv "${tmp_base}.tmpfs" "${tmp_base}"; then
      log "failed to filter /tmp entries from ${FSTAB_PATH}; leaving it untouched"
      return 1
    fi
  fi

  if ! cp "${tmp_base}" "${tmp_candidate}"; then
    log "failed to stage fstab candidate; leaving ${FSTAB_PATH} untouched"
    return 1
  fi

  if ! {
    printf '%s\n' "${MANAGED_BEGIN}"
    printf '%s %s %s defaults,noatime,lazytime,commit=60,x-systemd.device-timeout=15s,x-systemd.mount-timeout=30s 0 2\n' \
      "${selector}" "${USB_SCRATCH_MOUNTPOINT}" "${fstype}"
    if [ "${USB_SCRATCH_ENFORCE_TMPFS_TMP}" = "true" ]; then
      printf '%s\n' 'tmpfs /tmp tmpfs defaults,nosuid,nodev,mode=1777 0 0'
    fi
    for target in "${TARGET_PATHS[@]}"; do
      bind_source="${USB_SCRATCH_MOUNTPOINT}${target}"
      printf '%s %s none bind,x-systemd.requires-mounts-for=%s 0 0\n' \
        "${bind_source}" "${target}" "${USB_SCRATCH_MOUNTPOINT}"
    done
    printf '%s\n' "${MANAGED_END}"
  } >> "${tmp_candidate}"; then
    log "failed to write fstab candidate; leaving ${FSTAB_PATH} untouched"
    return 1
  fi

  if cmp -s "${FSTAB_PATH}" "${tmp_candidate}"; then
    return 1
  fi

  # cp -p seeds the staged file with fstab's mode/ownership before the
  # content is replaced; the rename then swaps it in atomically.
  if ! cp -p "${FSTAB_PATH}" "${tmp_install}" \
    || ! cat "${tmp_candidate}" > "${tmp_install}" \
    || ! mv -f "${tmp_install}" "${FSTAB_PATH}"; then
    rm -f "${tmp_install}"
    log "failed to install updated ${FSTAB_PATH}"
    return 1
  fi
  log "updated ${FSTAB_PATH} managed block"
  return 0
}
# Install a host-side guard that keeps k3s-agent from starting unless the
# scratch volume and every TARGET_PATHS bind mount are in place.
#
# Two artefacts are written under HOST_ROOT:
#   * usr/local/lib/maintenance/verify_astraios_mounts.sh - rewritten on
#     every call; exits non-zero when a mount is missing.
#   * etc/systemd/system/k3s-agent.service.d/20-astraios-guard.conf - a
#     drop-in adding RequiresMountsFor= and running the guard as
#     ExecStartPre=.
#
# Returns 0 when the drop-in was created or changed (caller should run
# daemon-reload), 1 when it was already up to date.
#
# findmnt reports the SOURCE of a bind mount as "device[/subdir]" rather
# than as the directory that was bound, so the guard accepts that form
# (built from the scratch volume's own device) in addition to the plain
# directory path. `-M` makes findmnt match only when the path itself is a
# mountpoint instead of falling back to the parent filesystem.
ensure_k3s_agent_guard() {
  local dropin_dir dropin_file guard_dir guard_file target verify_cmd requires_mounts
  local tmp_dropin

  dropin_dir="${HOST_ROOT}/etc/systemd/system/k3s-agent.service.d"
  dropin_file="${dropin_dir}/20-astraios-guard.conf"
  guard_dir="${HOST_ROOT}/usr/local/lib/maintenance"
  guard_file="${guard_dir}/verify_astraios_mounts.sh"
  tmp_dropin="${STATE_DIR}/k3s-agent-astraios-dropin.conf"
  mkdir -p "${dropin_dir}" "${guard_dir}" "${STATE_DIR}"

  # Guard header: fail fast unless the scratch volume is mounted, then
  # remember its device for the per-target checks.
  cat > "${guard_file}" <<EOF
#!/usr/bin/env bash
set -euo pipefail
mountpoint -q '${USB_SCRATCH_MOUNTPOINT}'
scratch_dev=\$(findmnt -M '${USB_SCRATCH_MOUNTPOINT}' -n -v -o SOURCE 2>/dev/null | tail -n 1 || true)
EOF

  # One check per managed path.
  for target in "${TARGET_PATHS[@]}"; do
    cat >> "${guard_file}" <<EOF
src=\$(findmnt -M '${target}' -n -o SOURCE 2>/dev/null | tail -n 1 || true)
if [[ "\${src}" != '${USB_SCRATCH_MOUNTPOINT}${target}' && "\${src}" != "\${scratch_dev}[${target}]" ]]; then
echo "astraios guard: ${target} is not bound to ${USB_SCRATCH_MOUNTPOINT}${target}" >&2
exit 1
fi
EOF
  done
  chmod 0755 "${guard_file}"

  requires_mounts="${USB_SCRATCH_MOUNTPOINT}"
  for target in "${TARGET_PATHS[@]}"; do
    requires_mounts="${requires_mounts} ${target}"
  done

  # The unit runs on the host, so strip the HOST_ROOT prefix from the path.
  verify_cmd="${guard_file#${HOST_ROOT}}"
  cat > "${tmp_dropin}" <<EOF
[Unit]
RequiresMountsFor=${requires_mounts}
After=local-fs.target
[Service]
ExecStartPre=${verify_cmd}
EOF

  if [ ! -f "${dropin_file}" ] || ! cmp -s "${dropin_file}" "${tmp_dropin}"; then
    cp "${tmp_dropin}" "${dropin_file}"
    log "updated k3s-agent Astraios guard drop-in"
    return 0
  fi
  return 1
}
# Succeed when findmnt on the host reports tmpfs as the filesystem type
# for /tmp.
tmp_is_tmpfs() {
  local reported_type
  reported_type="$(host_sh 'findmnt -T /tmp -n -o FSTYPE 2>/dev/null || true')"
  [[ "${reported_type}" == "tmpfs" ]]
}
# On the host: create /tmp with mode 1777 and, unless findmnt already
# reports it as tmpfs, mount it - first via its fstab entry, then with an
# explicit tmpfs mount as a fallback.
ensure_tmp_tmpfs_live() {
  # Single-quoted: every $ below is expanded by the host shell, not here.
  local host_snippet='mkdir -p /tmp; chmod 1777 /tmp; fstype=$(findmnt -T /tmp -n -o FSTYPE 2>/dev/null || true); if [ "${fstype}" != "tmpfs" ]; then mount /tmp 2>/dev/null || mount -t tmpfs -o defaults,nosuid,nodev,mode=1777 tmpfs /tmp; fi'
  host_sh "${host_snippet}"
}
# Print the SOURCE that findmnt reports on the host for the path in $1.
# Prints nothing when findmnt fails. Note that `findmnt -T` resolves a path
# that is not itself a mountpoint to the filesystem containing it.
find_existing_mount_source() {
  local path="$1"
  local host_snippet="findmnt -T '${path}' -n -o SOURCE 2>/dev/null || true"
  host_sh "${host_snippet}"
}
# Print the device path of the first partition that lsblk on the host marks
# as removable (RM == 1) and whose size is at least
# USB_SCRATCH_AUTO_MIN_SIZE_GIB. Prints nothing when none qualifies.
auto_discover_removable_partition() {
  local threshold_bytes host_snippet
  # GiB -> bytes.
  threshold_bytes=$(( USB_SCRATCH_AUTO_MIN_SIZE_GIB << 30 ))
  host_snippet="lsblk -brnpo NAME,TYPE,SIZE,RM | awk '\$2==\"part\" && \$4==\"1\" && \$3>=${threshold_bytes} {print \$1; exit}'"
  host_sh "${host_snippet}"
}
# DESTRUCTIVE: on the host, unmount the device in $1 if findmnt shows it
# mounted, wipe its signatures and create an ext4 filesystem labelled $2.
format_device_ext4() {
  local device="$1"
  local label="$2"
  local host_snippet
  host_snippet="mountpoint=\$(findmnt -S '${device}' -n -o TARGET 2>/dev/null || true); if [ -n \"\${mountpoint}\" ]; then umount \"\${mountpoint}\"; fi; wipefs -a '${device}'; mkfs.ext4 -F -L '${label}' '${device}'"
  host_sh "${host_snippet}"
}
# Decide which device backs the scratch volume on this node and publish the
# result through global variables.
#
# Selection precedence: inventory row for NODE_NAME, then
# USB_SCRATCH_DEFAULT_LABEL, then removable auto-discovery (when enabled).
#
# Globals written on every path:
#   SELECTOR_KIND   disabled | missing | invalid | fs-mismatch | uuid | label | device | auto
#   SELECTOR_VALUE, SELECTOR_SPEC (fstab source spec), DEVICE_PATH, DEVICE_FSTYPE
# Globals written only on some paths:
#   SELECTOR_MATCH_KIND (once a device lookup was attempted),
#   DEVICE_UUID and DEVICE_LABEL (only on the final success path).
# Always returns 0; callers branch on SELECTOR_KIND.
resolve_selector() {
local inventory_line enabled kind value fstype actual_device actual_fstype actual_uuid actual_label selector expected_fstype
inventory_line="$(lookup_inventory)"
enabled="${USB_SCRATCH_DEFAULT_ENABLED}"
kind=""
value=""
fstype="${USB_SCRATCH_DEFAULT_FSTYPE}"
if [ -n "${inventory_line}" ]; then
# Inventory columns: node_name enabled match_kind match_value fstype.
# A row with only four fields leaves fstype empty; the default is
# re-applied further down through expected_fstype.
read -r _ enabled kind value fstype _ <<<"${inventory_line}"
elif [ -n "${USB_SCRATCH_DEFAULT_LABEL}" ]; then
kind="label"
value="${USB_SCRATCH_DEFAULT_LABEL}"
elif [ "${USB_SCRATCH_AUTO_SELECT_REMOVABLE}" = "true" ]; then
kind="auto"
value="removable-${USB_SCRATCH_AUTO_MIN_SIZE_GIB}Gi-plus"
fi
if [ "${enabled}" != "true" ]; then
SELECTOR_KIND="disabled"
SELECTOR_VALUE=""
SELECTOR_SPEC=""
DEVICE_PATH=""
DEVICE_FSTYPE="${fstype}"
return 0
fi
# No inventory row, no default label and auto-selection disabled.
if [ -z "${kind}" ] || [ -z "${value}" ]; then
SELECTOR_KIND="missing"
SELECTOR_VALUE=""
SELECTOR_SPEC=""
DEVICE_PATH=""
DEVICE_FSTYPE="${fstype}"
return 0
fi
# Translate the match kind into an fstab selector and look up the device
# currently matching it on the host (empty when not present).
case "${kind}" in
uuid)
selector="UUID=${value}"
actual_device="$(host_sh "blkid -U '${value}' 2>/dev/null || true")"
;;
label)
selector="LABEL=${value}"
actual_device="$(host_sh "blkid -L '${value}' 2>/dev/null || true")"
;;
device)
selector="${value}"
actual_device="$(host_sh "if [ -b '${value}' ]; then printf '%s' '${value}'; fi")"
;;
auto)
actual_device="$(auto_discover_removable_partition)"
selector="${actual_device}"
;;
*)
SELECTOR_KIND="invalid"
SELECTOR_VALUE="${value}"
SELECTOR_SPEC=""
DEVICE_PATH=""
DEVICE_FSTYPE="${fstype}"
return 0
;;
esac
actual_fstype=""
actual_uuid=""
actual_label=""
# Auto-discovery found nothing: reported as "missing" rather than as a
# pending device, because there is no selector to put into fstab.
if [ "${kind}" = "auto" ] && [ -z "${actual_device}" ]; then
SELECTOR_KIND="missing"
SELECTOR_VALUE="${value}"
SELECTOR_SPEC=""
DEVICE_PATH=""
DEVICE_FSTYPE="${fstype}"
SELECTOR_MATCH_KIND="${kind}"
return 0
fi
if [ -n "${actual_device}" ]; then
actual_fstype="$(host_sh "blkid -o value -s TYPE '${actual_device}' 2>/dev/null || true")"
actual_uuid="$(host_sh "blkid -o value -s UUID '${actual_device}' 2>/dev/null || true")"
actual_label="$(host_sh "blkid -o value -s LABEL '${actual_device}' 2>/dev/null || true")"
fi
# For auto-discovered devices, switch the fstab selector from the device
# path to the filesystem UUID when one is available.
if [ "${kind}" = "auto" ] && [ -n "${actual_uuid}" ]; then
selector="UUID=${actual_uuid}"
fi
expected_fstype="${fstype:-${USB_SCRATCH_DEFAULT_FSTYPE}}"
SELECTOR_MATCH_KIND="${kind}"
# A present device with a different filesystem type is flagged; in this
# case DEVICE_FSTYPE carries the ACTUAL type, not the expected one.
if [ -n "${actual_fstype}" ] && [ -n "${expected_fstype}" ] && [ "${actual_fstype}" != "${expected_fstype}" ]; then
SELECTOR_KIND="fs-mismatch"
SELECTOR_VALUE="${selector}"
SELECTOR_SPEC="${selector}"
DEVICE_PATH="${actual_device}"
DEVICE_FSTYPE="${actual_fstype}"
return 0
fi
# Success path. DEVICE_PATH may still be empty here (uuid/label/device
# selector whose device is not plugged in yet).
SELECTOR_KIND="${kind}"
SELECTOR_VALUE="${value}"
SELECTOR_SPEC="${selector}"
DEVICE_PATH="${actual_device}"
DEVICE_FSTYPE="${expected_fstype}"
DEVICE_UUID="${actual_uuid}"
DEVICE_LABEL="${actual_label}"
}
# Create (via the HOST_ROOT mount) the state directory, the scratch
# mountpoint and, for every managed path, both the path itself and its
# counterpart underneath the scratch mountpoint.
ensure_directories() {
  local managed_path
  local scratch_root="${HOST_ROOT}${USB_SCRATCH_MOUNTPOINT}"

  mkdir -p "${STATE_DIR}" "${scratch_root}"
  for managed_path in "${TARGET_PATHS[@]}"; do
    mkdir -p "${HOST_ROOT}${managed_path}" "${scratch_root}${managed_path}"
  done
}
# Make sure the scratch volume is mounted at USB_SCRATCH_MOUNTPOINT on the
# host.
#
# Returns 0 when the expected device is (now) mounted there, 1 when a
# different source is already mounted at the mountpoint or when mounting
# fails.
#
# The conflict check only runs when the mountpoint really is a mountpoint:
# `findmnt -T` (used by find_existing_mount_source) otherwise answers with
# the parent filesystem, which would make every first-time mount look like
# a conflict. The mount result is checked explicitly because this function
# runs with errexit suppressed; a silent mount failure would let the caller
# continue and seed data onto the root filesystem.
ensure_usb_mount_live() {
  local existing_source=""

  if host_sh "mountpoint -q '${USB_SCRATCH_MOUNTPOINT}'"; then
    existing_source="$(find_existing_mount_source "${USB_SCRATCH_MOUNTPOINT}")"
    if [ -n "${existing_source}" ] && [ -n "${DEVICE_PATH}" ] \
      && [ "${existing_source}" != "${DEVICE_PATH}" ] \
      && [ "${existing_source}" != "${SELECTOR_SPEC}" ]; then
      log "usb scratch already mounted from unexpected source ${existing_source}"
      return 1
    fi
    return 0
  fi

  # Not mounted yet: mount through the fstab entry for the mountpoint.
  if ! host_sh "mkdir -p '${USB_SCRATCH_MOUNTPOINT}'; mount '${USB_SCRATCH_MOUNTPOINT}'"; then
    log "failed to mount usb scratch at ${USB_SCRATCH_MOUNTPOINT}"
    return 1
  fi
  return 0
}
# Print the available space of the filesystem holding USB_SCRATCH_MOUNTPOINT
# on the host, in GiB rounded to a whole number (taken from `df -Pk`).
free_space_gib() {
  local host_snippet
  host_snippet="df -Pk '${USB_SCRATCH_MOUNTPOINT}' | awk 'NR==2 { printf \"%.0f\", \$4 / 1024 / 1024 }'"
  host_sh "${host_snippet}"
}
# Succeed when the managed path in $1 is currently bind-mounted from its
# counterpart on the scratch volume.
#
# findmnt prints the SOURCE of a bind mount as "device[/subdir]" (the
# device of the backing filesystem plus the bound sub-directory), not as the
# directory that was bound. The expected value is therefore built from the
# scratch volume's own device; the plain directory form is still accepted.
# `-M` restricts the lookup to an actual mountpoint at the path, so an
# unmounted target never matches through its parent filesystem.
target_bound_to_scratch() {
  local target="$1"
  local current_source scratch_device

  current_source="$(host_sh "findmnt -M '${target}' -n -o SOURCE 2>/dev/null | tail -n 1")"
  if [ -z "${current_source}" ]; then
    return 1
  fi
  if [ "${current_source}" = "${USB_SCRATCH_MOUNTPOINT}${target}" ]; then
    return 0
  fi

  # -v (--nofsroot) yields the bare device without a "[...]" suffix.
  scratch_device="$(host_sh "findmnt -M '${USB_SCRATCH_MOUNTPOINT}' -n -v -o SOURCE 2>/dev/null | tail -n 1")"
  [ -n "${scratch_device}" ] && [ "${current_source}" = "${scratch_device}[${target}]" ]
}
# Copy the current contents of the managed path in $1 into its counterpart
# on the scratch volume (on the host). Uses rsync when the host has it,
# otherwise a tar pipe.
seed_target_data() {
  local target="$1"
  local scratch_copy="${USB_SCRATCH_MOUNTPOINT}${target}"
  local host_snippet
  host_snippet="mkdir -p '${scratch_copy}' '${target}'; if command -v rsync >/dev/null 2>&1; then rsync -aHAX --numeric-ids '${target}/' '${scratch_copy}/'; else tar -C '${target}' -cf - . | tar -C '${scratch_copy}' -xf -; fi"
  host_sh "${host_snippet}"
}
# On the host, mount the managed path in $1 (through its fstab entry)
# unless it is already a mountpoint.
mount_target_live() {
  local target="$1"
  local host_snippet="mountpoint -q '${target}' || mount '${target}'"
  host_sh "${host_snippet}"
}
# Succeed (0) when at least one managed path is not yet bound to the
# scratch volume; return 1 when every path is bound.
cutover_needed() {
  local managed_path
  for managed_path in "${TARGET_PATHS[@]}"; do
    target_bound_to_scratch "${managed_path}" || return 0
  done
  return 1
}
# Move every managed path that is not yet bound onto the scratch volume.
#
# Steps: optional random delay (0..USB_SCRATCH_CUTOVER_JITTER_MAX_SEC
# seconds), stop k3s-agent, then for each unbound path copy its data to the
# scratch volume and bind-mount it, and finally start k3s-agent again.
#
# Returns 0 when nothing had to be done or everything succeeded, 1 when the
# agent could not be stopped/started or a path could not be seeded/mounted.
#
# The caller runs this with errexit suppressed, so each step is checked
# explicitly: a path is never bind-mounted over a failed (partial) copy, and
# the cutover is skipped when the agent could not be stopped. When starting
# the agent fails, agent_stopped stays 1 so the EXIT trap retries the start.
perform_cutover() {
  local jitter target failed=0

  if ! cutover_needed; then
    return 0
  fi

  jitter=0
  if [ "${USB_SCRATCH_CUTOVER_JITTER_MAX_SEC}" -gt 0 ]; then
    jitter=$(( RANDOM % (USB_SCRATCH_CUTOVER_JITTER_MAX_SEC + 1) ))
  fi
  if [ "${jitter}" -gt 0 ]; then
    log "sleeping ${jitter}s before first live cutover"
    sleep "${jitter}"
  fi

  log "stopping k3s-agent for Astraios cutover"
  if ! host_sh "systemctl stop k3s-agent"; then
    log "failed to stop k3s-agent; skipping Astraios cutover"
    return 1
  fi
  agent_stopped=1

  for target in "${TARGET_PATHS[@]}"; do
    if target_bound_to_scratch "${target}"; then
      continue
    fi
    log "seeding ${target} into ${USB_SCRATCH_MOUNTPOINT}${target}"
    if ! seed_target_data "${target}"; then
      log "failed to seed ${target}; leaving it unbound"
      failed=1
      break
    fi
    log "mounting bind target ${target}"
    if ! mount_target_live "${target}"; then
      log "failed to mount bind target ${target}"
      failed=1
      break
    fi
  done

  log "starting k3s-agent after Astraios cutover"
  if host_sh "systemctl start k3s-agent"; then
    agent_stopped=0
  else
    log "failed to start k3s-agent after Astraios cutover"
    failed=1
  fi
  return "${failed}"
}
# Run one reconcile pass for this node: resolve the scratch device, keep
# fstab and the k3s-agent guard up to date, mount the volume, cut the
# managed paths over and publish the outcome as node annotations.
#
# Every early exit annotates the node with a status/detail pair and returns
# 0, so "pending"/"error" states are reported through annotations and logs
# rather than through the return code.
reconcile_once() {
local fstab_changed=false guard_changed=false free_gib selector_detail tmp_detail
load_config
resolve_selector
ensure_directories
# Terminal selector states stop the pass here; fs-mismatch may recover by
# reformatting an auto-discovered device.
case "${SELECTOR_KIND}" in
disabled)
annotate_node "disabled" "inventory-disabled" "none"
log "inventory disables Astraios on ${NODE_NAME}"
return 0
;;
missing)
annotate_node "pending" "missing-inventory" "none"
log "no inventory entry or default selector for ${NODE_NAME}"
return 0
;;
invalid)
annotate_node "error" "invalid-selector" "${SELECTOR_VALUE}"
log "invalid selector configured for ${NODE_NAME}"
return 0
;;
fs-mismatch)
# Destructive path: only taken for auto-discovered devices and only when
# USB_SCRATCH_AUTO_FORMAT_REMOVABLE is true.
if [ "${USB_SCRATCH_AUTO_FORMAT_REMOVABLE}" = "true" ] && [ "${SELECTOR_MATCH_KIND:-}" = "auto" ] && [ -n "${DEVICE_PATH}" ]; then
log "formatting auto-discovered device ${DEVICE_PATH} as ext4 label=${USB_SCRATCH_AUTO_FORMAT_LABEL}"
if format_device_ext4 "${DEVICE_PATH}" "${USB_SCRATCH_AUTO_FORMAT_LABEL}"; then
# Re-resolve so SELECTOR_KIND/SELECTOR_SPEC reflect the new filesystem.
resolve_selector
fi
fi
if [ "${SELECTOR_KIND}" = "fs-mismatch" ]; then
annotate_node "error" "filesystem-mismatch" "${SELECTOR_SPEC}"
# NOTE(review): the message prints USB_SCRATCH_DEFAULT_FSTYPE even when the
# inventory row supplied a different expected fstype.
log "filesystem mismatch on ${DEVICE_PATH}: expected ${USB_SCRATCH_DEFAULT_FSTYPE}, got ${DEVICE_FSTYPE}"
return 0
fi
;;
esac
selector_detail="${SELECTOR_SPEC}"
# fstab is written before the device presence check, so the entries exist
# even while the device is still missing.
if ensure_fstab_block "${SELECTOR_SPEC}" "${DEVICE_FSTYPE}"; then
fstab_changed=true
host_sh "systemctl daemon-reload || true"
fi
if [ -z "${DEVICE_PATH}" ]; then
annotate_node "pending" "device-not-found" "${selector_detail}"
log "Astraios device not present yet for selector ${selector_detail}"
return 0
fi
if ! ensure_usb_mount_live; then
annotate_node "error" "mount-conflict" "${selector_detail}"
return 0
fi
# Refuse to cut over onto a volume with too little free space.
free_gib="$(free_space_gib || true)"
if [ -z "${free_gib}" ]; then
annotate_node "error" "free-space-check-failed" "${selector_detail}"
return 0
fi
if [ "${free_gib}" -lt "${USB_SCRATCH_REQUIRED_FREE_GIB}" ]; then
annotate_node "error" "insufficient-free-space-${free_gib}Gi" "${selector_detail}"
log "Astraios free space ${free_gib}Gi below required ${USB_SCRATCH_REQUIRED_FREE_GIB}Gi"
return 0
fi
# The guard drop-in is installed before the cutover runs.
if ensure_k3s_agent_guard; then
guard_changed=true
host_sh "systemctl daemon-reload || true"
fi
if host_sh "systemctl list-unit-files | grep -q '^k3s-agent.service'"; then
perform_cutover
else
annotate_node "error" "missing-k3s-agent-service" "${selector_detail}"
log "k3s-agent.service missing on ${NODE_NAME}"
return 0
fi
# Verify the result independently of perform_cutover's return code.
if cutover_needed; then
annotate_node "error" "bind-mount-incomplete" "${selector_detail}"
return 0
fi
tmp_detail="tmpfs-ok"
if [ "${USB_SCRATCH_ENFORCE_TMPFS_TMP}" = "true" ]; then
if ! ensure_tmp_tmpfs_live || ! tmp_is_tmpfs; then
annotate_node "error" "tmpfs-tmp-enforce-failed" "${selector_detail}"
log "failed to enforce /tmp tmpfs on ${NODE_NAME}"
return 0
fi
elif ! tmp_is_tmpfs; then
# Enforcement is off: a non-tmpfs /tmp is only reported, not treated as an error.
tmp_detail="tmp-not-tmpfs"
log "warning: /tmp is not tmpfs on ${NODE_NAME}; SD wear reduction is lower than expected"
fi
if [ "${fstab_changed}" = true ]; then
log "Astraios fstab refreshed for ${NODE_NAME}"
fi
if [ "${guard_changed}" = true ]; then
log "k3s-agent Astraios guard refreshed for ${NODE_NAME}"
fi
annotate_node "ready" "astraios-online-${free_gib}Gi-${tmp_detail}" "${selector_detail}"
log "Astraios ready on ${NODE_NAME} via ${selector_detail} mounted at ${USB_SCRATCH_MOUNTPOINT}"
}
# Entry point: reconcile forever, sleeping USB_SCRATCH_RECONCILE_INTERVAL_SEC
# between passes. With ONE_SHOT=true the script exits 0 after the first
# pass. A failing pass never terminates the loop.
main() {
  while :; do
    reconcile_once || true
    [ "${ONE_SHOT}" != "true" ] || exit 0
    sleep "${USB_SCRATCH_RECONCILE_INTERVAL_SEC}"
  done
}

main

View File

@ -584,6 +584,44 @@
}
},
"timeFrom": "30d"
},
{
"id": 9,
"type": "timeseries",
"title": "Astraios Usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 44
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "30d"
}
],
"time": {

View File

@ -1957,7 +1957,7 @@
{
"id": 47,
"type": "bargauge",
"title": "Platform Suite Pass Rate (24h)",
"title": "PVC Backup Health / Age",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1970,31 +1970,35 @@
},
"targets": [
{
"expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
"refId": "A",
"legendFormat": "{{suite}}",
"legendFormat": "{{namespace}}/{{pvc}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"unit": "h",
"min": 0,
"max": 100,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 80
"value": 6
},
{
"color": "green",
"value": 95
"color": "orange",
"value": 12
},
{
"color": "red",
"value": 24
}
]
}
@ -2025,12 +2029,12 @@
],
"links": [
{
"title": "Open atlas-jobs dashboard",
"url": "/d/atlas-jobs",
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
},
{
"id": 30,
@ -3172,7 +3176,7 @@
{
"id": 22,
"type": "bargauge",
"title": "Nodes Closest to Full Root Disks",
"title": "Nodes Closest to Full Astraios Disks",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -3185,7 +3189,7 @@
},
"targets": [
{
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"refId": "A",
"legendFormat": "{{node}}"
}

View File

@ -593,6 +593,44 @@ data:
}
},
"timeFrom": "30d"
},
{
"id": 9,
"type": "timeseries",
"title": "Astraios Usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 44
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "30d"
}
],
"time": {

View File

@ -1966,7 +1966,7 @@ data:
{
"id": 47,
"type": "bargauge",
"title": "Platform Suite Pass Rate (24h)",
"title": "PVC Backup Health / Age",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1979,31 +1979,35 @@ data:
},
"targets": [
{
"expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
"refId": "A",
"legendFormat": "{{suite}}",
"legendFormat": "{{namespace}}/{{pvc}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"unit": "h",
"min": 0,
"max": 100,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 80
"value": 6
},
{
"color": "green",
"value": 95
"color": "orange",
"value": 12
},
{
"color": "red",
"value": 24
}
]
}
@ -2034,12 +2038,12 @@ data:
],
"links": [
{
"title": "Open atlas-jobs dashboard",
"url": "/d/atlas-jobs",
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
},
{
"id": 30,
@ -3181,7 +3185,7 @@ data:
{
"id": 22,
"type": "bargauge",
"title": "Nodes Closest to Full Root Disks",
"title": "Nodes Closest to Full Astraios Disks",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -3194,7 +3198,7 @@ data:
},
"targets": [
{
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"refId": "A",
"legendFormat": "{{node}}"
}