maintenance: enforce Astraios + tmpfs /tmp on worker Pis

This commit is contained in:
Brad Stein 2026-04-11 11:54:43 -03:00
parent 5483c04bb3
commit 40de2b59a5
7 changed files with 339 additions and 76 deletions

View File

@ -35,6 +35,7 @@ data:
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
PUBLIC_FOLDER = "overview"
PRIVATE_FOLDER = "atlas-internal"
ASTRAIOS_MOUNTPOINT = "/mnt/astraios"
PERCENT_THRESHOLDS = {
"mode": "absolute",
@ -156,6 +157,10 @@ def root_usage_expr(scope=""):
return filesystem_usage_expr("/", scope)
def astraios_usage_expr(scope=""):
    """Return the PromQL usage expression for the shared Astraios mountpoint.

    Thin wrapper over filesystem_usage_expr() pinned to ASTRAIOS_MOUNTPOINT;
    ``scope`` is forwarded unchanged as the optional label scope.
    """
    mount = ASTRAIOS_MOUNTPOINT
    return filesystem_usage_expr(mount, scope)
def astreae_usage_expr(mount):
return (
f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / "
@ -533,6 +538,7 @@ PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
)
# Oldest backup age in hours per PVC, sorted worst-first for the bargauge.
# NOTE(review): pvc_backup_age_hours is not published yet per the panel
# description below — the panel shows no data until that feed exists.
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))"
# Ananke power-monitoring job selector plus UPS identity constants
# (names presumably match entries in the UPS database — verify).
ANANKE_SELECTOR = 'job="ananke-power"'
ANANKE_UPS_DB_NAME = "Pyrphoros"
ANANKE_UPS_DB_NODE = "titan-db"
@ -1601,26 +1607,27 @@ def build_overview():
panels.append(
bargauge_panel(
47,
"Platform Suite Pass Rate (24h)",
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE,
"PVC Backup Health / Age",
PVC_BACKUP_AGE_HOURS_BY_PVC,
{"h": 5, "w": 6, "x": 18, "y": 7},
unit="percent",
unit="h",
instant=True,
legend="{{suite}}",
legend="{{namespace}}/{{pvc}}",
sort_order="desc",
thresholds={
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "yellow", "value": 80},
{"color": "green", "value": 95},
{"color": "green", "value": None},
{"color": "yellow", "value": 6},
{"color": "orange", "value": 12},
{"color": "red", "value": 24},
],
},
)
)
panels[-1]["links"] = link_to("atlas-jobs")
panels[-1]["links"] = link_to("atlas-storage")
panels[-1]["description"] = (
"24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
"Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
)
panels.append(
@ -1908,8 +1915,8 @@ def build_overview():
panels.append(
bargauge_panel(
22,
"Nodes Closest to Full Root Disks",
f"topk(12, {root_usage_expr()})",
"Nodes Closest to Full Astraios Disks",
f"topk(12, {astraios_usage_expr()})",
{"h": 16, "w": 12, "x": 12, "y": 71},
unit="percent",
thresholds=PERCENT_THRESHOLDS,
@ -2282,6 +2289,19 @@ def build_nodes_dashboard():
time_from="30d",
)
)
panels.append(
timeseries_panel(
9,
"Astraios Usage",
astraios_usage_expr(),
{"h": 9, "w": 24, "x": 0, "y": 44},
unit="percent",
legend="{{node}}",
legend_display="table",
legend_placement="right",
time_from="30d",
)
)
return {
"uid": "atlas-nodes",
"title": "Atlas Nodes",

View File

@ -7,29 +7,42 @@ metadata:
data:
usb_scratch.env: |
USB_SCRATCH_DEFAULT_ENABLED=true
USB_SCRATCH_DEFAULT_LABEL=atlas-scratch
# Leave empty to avoid label-based fallback selection.
USB_SCRATCH_DEFAULT_LABEL=
USB_SCRATCH_DEFAULT_FSTYPE=ext4
USB_SCRATCH_MOUNTPOINT=/mnt/usb-scratch
USB_SCRATCH_MOUNTPOINT=/mnt/astraios
# Auto-select the removable USB partition (>= 50 GiB, per USB_SCRATCH_AUTO_MIN_SIZE_GIB) on each worker.
USB_SCRATCH_AUTO_SELECT_REMOVABLE=true
USB_SCRATCH_AUTO_MIN_SIZE_GIB=50
# One-time bootstrap for new sticks that ship exfat/fat32.
USB_SCRATCH_AUTO_FORMAT_REMOVABLE=true
USB_SCRATCH_AUTO_FORMAT_LABEL=astraios
# Keep /tmp in RAM to reduce SD-card writes.
USB_SCRATCH_ENFORCE_TMPFS_TMP=true
USB_SCRATCH_REQUIRED_FREE_GIB=20
USB_SCRATCH_RECONCILE_INTERVAL_SEC=900
USB_SCRATCH_CUTOVER_JITTER_MAX_SEC=900
usb_scratch_inventory.tsv: |
# node_name enabled match_kind match_value fstype
# match_kind: uuid | label | device
# Prefer UUID entries for the first rollout. A shared label works too if every Pi USB stick is formatted consistently.
# Astraios policy:
# - use UUID entries per worker node (preferred)
# - avoid shared labels to prevent accidental wrong-device mounts
# - mountpoint is /mnt/astraios on every worker node
# Example:
# titan-04 true uuid 11111111-2222-3333-4444-555555555555 ext4
# titan-05 true label atlas-scratch ext4
# titan-06 true label atlas-scratch ext4
# titan-07 true label atlas-scratch ext4
# titan-08 true label atlas-scratch ext4
# titan-09 true label atlas-scratch ext4
# titan-10 true label atlas-scratch ext4
# titan-11 true label atlas-scratch ext4
# titan-12 true label atlas-scratch ext4
# titan-13 true label atlas-scratch ext4
# titan-14 true label atlas-scratch ext4
# titan-15 true label atlas-scratch ext4
# titan-17 true label atlas-scratch ext4
# titan-18 true label atlas-scratch ext4
# titan-19 true label atlas-scratch ext4
# titan-05 true uuid <uuid-for-titan-05-astraios> ext4
# titan-06 true uuid <uuid-for-titan-06-astraios> ext4
# titan-07 true uuid <uuid-for-titan-07-astraios> ext4
# titan-08 true uuid <uuid-for-titan-08-astraios> ext4
# titan-09 true uuid <uuid-for-titan-09-astraios> ext4
# titan-10 true uuid <uuid-for-titan-10-astraios> ext4
# titan-11 true uuid <uuid-for-titan-11-astraios> ext4
# titan-12 true uuid <uuid-for-titan-12-astraios> ext4
# titan-13 true uuid <uuid-for-titan-13-astraios> ext4
# titan-14 true uuid <uuid-for-titan-14-astraios> ext4
# titan-15 true uuid <uuid-for-titan-15-astraios> ext4
# titan-16 true uuid <uuid-for-titan-16-astraios> ext4
# titan-17 true uuid <uuid-for-titan-17-astraios> ext4
# titan-18 true uuid <uuid-for-titan-18-astraios> ext4
# titan-19 true uuid <uuid-for-titan-19-astraios> ext4

View File

@ -13,9 +13,14 @@ ONE_SHOT=${ONE_SHOT:-false}
DEFAULT_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
USB_SCRATCH_DEFAULT_ENABLED=${USB_SCRATCH_DEFAULT_ENABLED:-true}
USB_SCRATCH_DEFAULT_LABEL=${USB_SCRATCH_DEFAULT_LABEL:-atlas-scratch}
USB_SCRATCH_DEFAULT_LABEL=${USB_SCRATCH_DEFAULT_LABEL:-}
USB_SCRATCH_DEFAULT_FSTYPE=${USB_SCRATCH_DEFAULT_FSTYPE:-ext4}
USB_SCRATCH_MOUNTPOINT=${USB_SCRATCH_MOUNTPOINT:-/mnt/usb-scratch}
USB_SCRATCH_MOUNTPOINT=${USB_SCRATCH_MOUNTPOINT:-/mnt/astraios}
USB_SCRATCH_ENFORCE_TMPFS_TMP=${USB_SCRATCH_ENFORCE_TMPFS_TMP:-true}
USB_SCRATCH_AUTO_SELECT_REMOVABLE=${USB_SCRATCH_AUTO_SELECT_REMOVABLE:-true}
USB_SCRATCH_AUTO_MIN_SIZE_GIB=${USB_SCRATCH_AUTO_MIN_SIZE_GIB:-50}
USB_SCRATCH_AUTO_FORMAT_REMOVABLE=${USB_SCRATCH_AUTO_FORMAT_REMOVABLE:-true}
USB_SCRATCH_AUTO_FORMAT_LABEL=${USB_SCRATCH_AUTO_FORMAT_LABEL:-astraios}
USB_SCRATCH_REQUIRED_FREE_GIB=${USB_SCRATCH_REQUIRED_FREE_GIB:-20}
USB_SCRATCH_RECONCILE_INTERVAL_SEC=${USB_SCRATCH_RECONCILE_INTERVAL_SEC:-900}
USB_SCRATCH_CUTOVER_JITTER_MAX_SEC=${USB_SCRATCH_CUTOVER_JITTER_MAX_SEC:-900}
@ -24,7 +29,9 @@ TARGET_PATHS=(
"/var/log/pods"
"/var/log/containers"
"/var/lib/rancher/k3s/agent/containerd"
"/var/lib/rancher/k3s/agent/kubelet"
"/var/lib/rancher/k3s/agent/images"
"/var/tmp"
)
agent_stopped=0
@ -44,6 +51,12 @@ annotate_node() {
local timestamp
timestamp="$(date -u +%FT%TZ)"
kubectl annotate --overwrite node "${NODE_NAME}" \
maintenance.bstein.dev/astraios-status="$(sanitize_annotation_value "${status}")" \
maintenance.bstein.dev/astraios-detail="$(sanitize_annotation_value "${detail}")" \
maintenance.bstein.dev/astraios-selector="$(sanitize_annotation_value "${selector}")" \
maintenance.bstein.dev/astraios-mountpoint="$(sanitize_annotation_value "${USB_SCRATCH_MOUNTPOINT}")" \
maintenance.bstein.dev/astraios-managed-paths="$(sanitize_annotation_value "${TARGET_PATHS[*]}")" \
maintenance.bstein.dev/astraios-last-apply="${timestamp}" \
maintenance.bstein.dev/usb-scratch-status="$(sanitize_annotation_value "${status}")" \
maintenance.bstein.dev/usb-scratch-detail="$(sanitize_annotation_value "${detail}")" \
maintenance.bstein.dev/usb-scratch-selector="$(sanitize_annotation_value "${selector}")" \
@ -82,9 +95,14 @@ load_config() {
fi
USB_SCRATCH_DEFAULT_ENABLED=${USB_SCRATCH_DEFAULT_ENABLED:-true}
USB_SCRATCH_DEFAULT_LABEL=${USB_SCRATCH_DEFAULT_LABEL:-atlas-scratch}
USB_SCRATCH_DEFAULT_LABEL=${USB_SCRATCH_DEFAULT_LABEL:-}
USB_SCRATCH_DEFAULT_FSTYPE=${USB_SCRATCH_DEFAULT_FSTYPE:-ext4}
USB_SCRATCH_MOUNTPOINT=${USB_SCRATCH_MOUNTPOINT:-/mnt/usb-scratch}
USB_SCRATCH_MOUNTPOINT=${USB_SCRATCH_MOUNTPOINT:-/mnt/astraios}
USB_SCRATCH_ENFORCE_TMPFS_TMP=${USB_SCRATCH_ENFORCE_TMPFS_TMP:-true}
USB_SCRATCH_AUTO_SELECT_REMOVABLE=${USB_SCRATCH_AUTO_SELECT_REMOVABLE:-true}
USB_SCRATCH_AUTO_MIN_SIZE_GIB=${USB_SCRATCH_AUTO_MIN_SIZE_GIB:-50}
USB_SCRATCH_AUTO_FORMAT_REMOVABLE=${USB_SCRATCH_AUTO_FORMAT_REMOVABLE:-true}
USB_SCRATCH_AUTO_FORMAT_LABEL=${USB_SCRATCH_AUTO_FORMAT_LABEL:-astraios}
USB_SCRATCH_REQUIRED_FREE_GIB=${USB_SCRATCH_REQUIRED_FREE_GIB:-20}
USB_SCRATCH_RECONCILE_INTERVAL_SEC=${USB_SCRATCH_RECONCILE_INTERVAL_SEC:-900}
USB_SCRATCH_CUTOVER_JITTER_MAX_SEC=${USB_SCRATCH_CUTOVER_JITTER_MAX_SEC:-900}
@ -117,15 +135,22 @@ ensure_fstab_block() {
tmp_candidate="${STATE_DIR}/fstab.candidate"
strip_managed_block "${FSTAB_PATH}" > "${tmp_base}"
if [ "${USB_SCRATCH_ENFORCE_TMPFS_TMP}" = "true" ]; then
awk '$1 ~ /^#/ || $2 != "/tmp" { print }' "${tmp_base}" > "${tmp_base}.tmpfs"
mv "${tmp_base}.tmpfs" "${tmp_base}"
fi
cp "${tmp_base}" "${tmp_candidate}"
{
printf '%s\n' "${MANAGED_BEGIN}"
printf '%s %s %s defaults,nofail,noatime,lazytime,commit=60,x-systemd.device-timeout=15s,x-systemd.mount-timeout=30s 0 2\n' \
printf '%s %s %s defaults,noatime,lazytime,commit=60,x-systemd.device-timeout=15s,x-systemd.mount-timeout=30s 0 2\n' \
"${selector}" "${USB_SCRATCH_MOUNTPOINT}" "${fstype}"
if [ "${USB_SCRATCH_ENFORCE_TMPFS_TMP}" = "true" ]; then
printf '%s\n' 'tmpfs /tmp tmpfs defaults,nosuid,nodev,mode=1777 0 0'
fi
for target in "${TARGET_PATHS[@]}"; do
bind_source="${USB_SCRATCH_MOUNTPOINT}${target}"
printf '%s %s none bind,nofail,x-systemd.requires-mounts-for=%s 0 0\n' \
printf '%s %s none bind,x-systemd.requires-mounts-for=%s 0 0\n' \
"${bind_source}" "${target}" "${USB_SCRATCH_MOUNTPOINT}"
done
printf '%s\n' "${MANAGED_END}"
@ -140,11 +165,84 @@ ensure_fstab_block() {
return 1
}
ensure_k3s_agent_guard() {
    # Install (idempotently) a systemd drop-in plus verifier script so that
    # k3s-agent refuses to start unless the Astraios mount and all bind
    # targets are in place. Returns 0 when the drop-in changed (caller should
    # daemon-reload), 1 when it was already up to date.
    local dropin_dir dropin_file guard_dir guard_file target verify_cmd requires_mounts
    local tmp_dropin
    dropin_dir="${HOST_ROOT}/etc/systemd/system/k3s-agent.service.d"
    dropin_file="${dropin_dir}/20-astraios-guard.conf"
    guard_dir="${HOST_ROOT}/usr/local/lib/maintenance"
    guard_file="${guard_dir}/verify_astraios_mounts.sh"
    tmp_dropin="${STATE_DIR}/k3s-agent-astraios-dropin.conf"
    mkdir -p "${dropin_dir}" "${guard_dir}" "${STATE_DIR}"
    # Guard script, part 1: hard-fail unless the Astraios mountpoint is live.
    # Config values are expanded NOW (unescaped) into the generated script.
    cat > "${guard_file}" <<EOF
#!/usr/bin/env bash
set -euo pipefail
mountpoint -q '${USB_SCRATCH_MOUNTPOINT}'
EOF
    # Guard script, part 2: one check per managed path verifying its bind
    # mount points back into the Astraios tree. \$ sequences stay literal so
    # findmnt runs at guard time, not at generation time.
    for target in "${TARGET_PATHS[@]}"; do
        cat >> "${guard_file}" <<EOF
src=\$(findmnt -T '${target}' -n -o SOURCE 2>/dev/null || true)
if [[ "\${src}" != '${USB_SCRATCH_MOUNTPOINT}${target}' ]]; then
echo "astraios guard: ${target} is not bound to ${USB_SCRATCH_MOUNTPOINT}${target}" >&2
exit 1
fi
EOF
    done
    chmod 0755 "${guard_file}"
    # Let systemd order k3s-agent after every mount it depends on.
    requires_mounts="${USB_SCRATCH_MOUNTPOINT}"
    for target in "${TARGET_PATHS[@]}"; do
        requires_mounts="${requires_mounts} ${target}"
    done
    # The drop-in runs on the host, so strip the container-side HOST_ROOT
    # prefix from the guard path before embedding it in ExecStartPre.
    verify_cmd="${guard_file#${HOST_ROOT}}"
    cat > "${tmp_dropin}" <<EOF
[Unit]
RequiresMountsFor=${requires_mounts}
After=local-fs.target
[Service]
ExecStartPre=${verify_cmd}
EOF
    # Only rewrite the drop-in on content change so callers can key a
    # daemon-reload off the return status.
    if [ ! -f "${dropin_file}" ] || ! cmp -s "${dropin_file}" "${tmp_dropin}"; then
        cp "${tmp_dropin}" "${dropin_file}"
        log "updated k3s-agent Astraios guard drop-in"
        return 0
    fi
    return 1
}
tmp_is_tmpfs() {
    # Succeed iff the host's /tmp is currently backed by a tmpfs mount.
    local current_fs
    current_fs="$(host_sh "findmnt -T /tmp -n -o FSTYPE 2>/dev/null || true")"
    [ "${current_fs}" = "tmpfs" ]
}
ensure_tmp_tmpfs_live() {
    # Best-effort live enforcement of tmpfs on /tmp (the managed fstab block
    # covers reboots): ensure the directory exists with sticky 1777 perms,
    # then — only when /tmp is not already tmpfs — try the fstab entry first
    # ("mount /tmp") and fall back to an explicit tmpfs mount with the same
    # options the fstab block writes.
    host_sh "mkdir -p /tmp; chmod 1777 /tmp; fstype=\$(findmnt -T /tmp -n -o FSTYPE 2>/dev/null || true); if [ \"\${fstype}\" != \"tmpfs\" ]; then mount /tmp 2>/dev/null || mount -t tmpfs -o defaults,nosuid,nodev,mode=1777 tmpfs /tmp; fi"
}
find_existing_mount_source() {
    # Print the filesystem source currently mounted at the given path on the
    # host; prints nothing (and still succeeds) when the path is not a mount.
    local query_path="$1"
    host_sh "findmnt -T '${query_path}' -n -o SOURCE 2>/dev/null || true"
}
auto_discover_removable_partition() {
    # Print the first removable partition at least USB_SCRATCH_AUTO_MIN_SIZE_GIB
    # large; prints nothing when no candidate qualifies.
    local min_bytes
    min_bytes=$(( USB_SCRATCH_AUTO_MIN_SIZE_GIB * 1024 * 1024 * 1024 ))
    # lsblk -brnpo: sizes in bytes, raw output, no headings, full device paths.
    # awk fields map to NAME TYPE SIZE RM, so $2=="part" && $4=="1" keeps only
    # removable partitions; "exit" stops at the first match.
    host_sh "lsblk -brnpo NAME,TYPE,SIZE,RM | awk '\$2==\"part\" && \$4==\"1\" && \$3>=${min_bytes} {print \$1; exit}'"
}
format_device_ext4() {
    # DESTRUCTIVE: unmount the device if mounted, wipe all filesystem
    # signatures, and reformat it as ext4 with the given label. Callers gate
    # this behind USB_SCRATCH_AUTO_FORMAT_REMOVABLE for auto-discovered
    # devices only.
    local device="$1"
    local label="$2"
    # NOTE(review): only the first mountpoint reported by findmnt is
    # unmounted; a device mounted in several places would still fail at
    # wipefs — confirm that is acceptable for these single-use sticks.
    host_sh "mountpoint=\$(findmnt -S '${device}' -n -o TARGET 2>/dev/null || true); if [ -n \"\${mountpoint}\" ]; then umount \"\${mountpoint}\"; fi; wipefs -a '${device}'; mkfs.ext4 -F -L '${label}' '${device}'"
}
resolve_selector() {
local inventory_line enabled kind value fstype actual_device actual_fstype actual_uuid actual_label selector expected_fstype
inventory_line="$(lookup_inventory)"
@ -158,6 +256,9 @@ resolve_selector() {
elif [ -n "${USB_SCRATCH_DEFAULT_LABEL}" ]; then
kind="label"
value="${USB_SCRATCH_DEFAULT_LABEL}"
elif [ "${USB_SCRATCH_AUTO_SELECT_REMOVABLE}" = "true" ]; then
kind="auto"
value="removable-${USB_SCRATCH_AUTO_MIN_SIZE_GIB}Gi-plus"
fi
if [ "${enabled}" != "true" ]; then
@ -191,6 +292,10 @@ resolve_selector() {
selector="${value}"
actual_device="$(host_sh "if [ -b '${value}' ]; then printf '%s' '${value}'; fi")"
;;
auto)
actual_device="$(auto_discover_removable_partition)"
selector="${actual_device}"
;;
*)
SELECTOR_KIND="invalid"
SELECTOR_VALUE="${value}"
@ -204,13 +309,26 @@ resolve_selector() {
actual_fstype=""
actual_uuid=""
actual_label=""
if [ "${kind}" = "auto" ] && [ -z "${actual_device}" ]; then
SELECTOR_KIND="missing"
SELECTOR_VALUE="${value}"
SELECTOR_SPEC=""
DEVICE_PATH=""
DEVICE_FSTYPE="${fstype}"
SELECTOR_MATCH_KIND="${kind}"
return 0
fi
if [ -n "${actual_device}" ]; then
actual_fstype="$(host_sh "blkid -o value -s TYPE '${actual_device}' 2>/dev/null || true")"
actual_uuid="$(host_sh "blkid -o value -s UUID '${actual_device}' 2>/dev/null || true")"
actual_label="$(host_sh "blkid -o value -s LABEL '${actual_device}' 2>/dev/null || true")"
fi
if [ "${kind}" = "auto" ] && [ -n "${actual_uuid}" ]; then
selector="UUID=${actual_uuid}"
fi
expected_fstype="${fstype:-${USB_SCRATCH_DEFAULT_FSTYPE}}"
SELECTOR_MATCH_KIND="${kind}"
if [ -n "${actual_fstype}" ] && [ -n "${expected_fstype}" ] && [ "${actual_fstype}" != "${expected_fstype}" ]; then
SELECTOR_KIND="fs-mismatch"
SELECTOR_VALUE="${selector}"
@ -298,7 +416,7 @@ perform_cutover() {
sleep "${jitter}"
fi
log "stopping k3s-agent for USB scratch cutover"
log "stopping k3s-agent for Astraios cutover"
host_sh "systemctl stop k3s-agent"
agent_stopped=1
@ -311,13 +429,13 @@ perform_cutover() {
fi
done
log "starting k3s-agent after USB scratch cutover"
log "starting k3s-agent after Astraios cutover"
host_sh "systemctl start k3s-agent"
agent_stopped=0
}
reconcile_once() {
local fstab_changed=false free_gib selector_detail
local fstab_changed=false guard_changed=false free_gib selector_detail tmp_detail
load_config
resolve_selector
@ -326,7 +444,7 @@ reconcile_once() {
case "${SELECTOR_KIND}" in
disabled)
annotate_node "disabled" "inventory-disabled" "none"
log "inventory disables USB scratch on ${NODE_NAME}"
log "inventory disables Astraios on ${NODE_NAME}"
return 0
;;
missing)
@ -340,9 +458,17 @@ reconcile_once() {
return 0
;;
fs-mismatch)
annotate_node "error" "filesystem-mismatch" "${SELECTOR_SPEC}"
log "filesystem mismatch on ${DEVICE_PATH}: expected ${USB_SCRATCH_DEFAULT_FSTYPE}, got ${DEVICE_FSTYPE}"
return 0
if [ "${USB_SCRATCH_AUTO_FORMAT_REMOVABLE}" = "true" ] && [ "${SELECTOR_MATCH_KIND:-}" = "auto" ] && [ -n "${DEVICE_PATH}" ]; then
log "formatting auto-discovered device ${DEVICE_PATH} as ext4 label=${USB_SCRATCH_AUTO_FORMAT_LABEL}"
if format_device_ext4 "${DEVICE_PATH}" "${USB_SCRATCH_AUTO_FORMAT_LABEL}"; then
resolve_selector
fi
fi
if [ "${SELECTOR_KIND}" = "fs-mismatch" ]; then
annotate_node "error" "filesystem-mismatch" "${SELECTOR_SPEC}"
log "filesystem mismatch on ${DEVICE_PATH}: expected ${USB_SCRATCH_DEFAULT_FSTYPE}, got ${DEVICE_FSTYPE}"
return 0
fi
;;
esac
@ -355,8 +481,8 @@ reconcile_once() {
if [ -z "${DEVICE_PATH}" ]; then
annotate_node "pending" "device-not-found" "${selector_detail}"
log "scratch device not present yet for selector ${selector_detail}"
return 0
log "Astraios device not present yet for selector ${selector_detail}"
return 0
fi
if ! ensure_usb_mount_live; then
@ -372,10 +498,15 @@ reconcile_once() {
if [ "${free_gib}" -lt "${USB_SCRATCH_REQUIRED_FREE_GIB}" ]; then
annotate_node "error" "insufficient-free-space-${free_gib}Gi" "${selector_detail}"
log "usb scratch free space ${free_gib}Gi below required ${USB_SCRATCH_REQUIRED_FREE_GIB}Gi"
log "Astraios free space ${free_gib}Gi below required ${USB_SCRATCH_REQUIRED_FREE_GIB}Gi"
return 0
fi
if ensure_k3s_agent_guard; then
guard_changed=true
host_sh "systemctl daemon-reload || true"
fi
if host_sh "systemctl list-unit-files | grep -q '^k3s-agent.service'"; then
perform_cutover
else
@ -389,11 +520,26 @@ reconcile_once() {
return 0
fi
if [ "${fstab_changed}" = true ]; then
log "usb scratch fstab refreshed for ${NODE_NAME}"
tmp_detail="tmpfs-ok"
if [ "${USB_SCRATCH_ENFORCE_TMPFS_TMP}" = "true" ]; then
if ! ensure_tmp_tmpfs_live || ! tmp_is_tmpfs; then
annotate_node "error" "tmpfs-tmp-enforce-failed" "${selector_detail}"
log "failed to enforce /tmp tmpfs on ${NODE_NAME}"
return 0
fi
elif ! tmp_is_tmpfs; then
tmp_detail="tmp-not-tmpfs"
log "warning: /tmp is not tmpfs on ${NODE_NAME}; SD wear reduction is lower than expected"
fi
annotate_node "ready" "scratch-online-${free_gib}Gi" "${selector_detail}"
log "usb scratch ready on ${NODE_NAME} via ${selector_detail} mounted at ${USB_SCRATCH_MOUNTPOINT}"
if [ "${fstab_changed}" = true ]; then
log "Astraios fstab refreshed for ${NODE_NAME}"
fi
if [ "${guard_changed}" = true ]; then
log "k3s-agent Astraios guard refreshed for ${NODE_NAME}"
fi
annotate_node "ready" "astraios-online-${free_gib}Gi-${tmp_detail}" "${selector_detail}"
log "Astraios ready on ${NODE_NAME} via ${selector_detail} mounted at ${USB_SCRATCH_MOUNTPOINT}"
}
main() {

View File

@ -584,6 +584,44 @@
}
},
"timeFrom": "30d"
},
{
"id": 9,
"type": "timeseries",
"title": "Astraios Usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 44
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "30d"
}
],
"time": {

View File

@ -1957,7 +1957,7 @@
{
"id": 47,
"type": "bargauge",
"title": "Platform Suite Pass Rate (24h)",
"title": "PVC Backup Health / Age",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1970,31 +1970,35 @@
},
"targets": [
{
"expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
"refId": "A",
"legendFormat": "{{suite}}",
"legendFormat": "{{namespace}}/{{pvc}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"unit": "h",
"min": 0,
"max": 100,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 80
"value": 6
},
{
"color": "green",
"value": 95
"color": "orange",
"value": 12
},
{
"color": "red",
"value": 24
}
]
}
@ -2025,12 +2029,12 @@
],
"links": [
{
"title": "Open atlas-jobs dashboard",
"url": "/d/atlas-jobs",
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
},
{
"id": 30,
@ -3172,7 +3176,7 @@
{
"id": 22,
"type": "bargauge",
"title": "Nodes Closest to Full Root Disks",
"title": "Nodes Closest to Full Astraios Disks",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -3185,7 +3189,7 @@
},
"targets": [
{
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"refId": "A",
"legendFormat": "{{node}}"
}

View File

@ -593,6 +593,44 @@ data:
}
},
"timeFrom": "30d"
},
{
"id": 9,
"type": "timeseries",
"title": "Astraios Usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 44
},
"targets": [
{
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "30d"
}
],
"time": {

View File

@ -1966,7 +1966,7 @@ data:
{
"id": 47,
"type": "bargauge",
"title": "Platform Suite Pass Rate (24h)",
"title": "PVC Backup Health / Age",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -1979,31 +1979,35 @@ data:
},
"targets": [
{
"expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
"refId": "A",
"legendFormat": "{{suite}}",
"legendFormat": "{{namespace}}/{{pvc}}",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"unit": "h",
"min": 0,
"max": 100,
"max": null,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 80
"value": 6
},
{
"color": "green",
"value": 95
"color": "orange",
"value": 12
},
{
"color": "red",
"value": 24
}
]
}
@ -2034,12 +2038,12 @@ data:
],
"links": [
{
"title": "Open atlas-jobs dashboard",
"url": "/d/atlas-jobs",
"title": "Open atlas-storage dashboard",
"url": "/d/atlas-storage",
"targetBlank": true
}
],
"description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
},
{
"id": 30,
@ -3181,7 +3185,7 @@ data:
{
"id": 22,
"type": "bargauge",
"title": "Nodes Closest to Full Root Disks",
"title": "Nodes Closest to Full Astraios Disks",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -3194,7 +3198,7 @@ data:
},
"targets": [
{
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
"refId": "A",
"legendFormat": "{{node}}"
}