maintenance: enforce Astraios + tmpfs /tmp on worker Pis
This commit is contained in:
parent
5e39164fcd
commit
3ea296b552
@ -35,6 +35,7 @@ data:
|
|||||||
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
|
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
|
||||||
PUBLIC_FOLDER = "overview"
|
PUBLIC_FOLDER = "overview"
|
||||||
PRIVATE_FOLDER = "atlas-internal"
|
PRIVATE_FOLDER = "atlas-internal"
|
||||||
|
ASTRAIOS_MOUNTPOINT = "/mnt/astraios"
|
||||||
|
|
||||||
PERCENT_THRESHOLDS = {
|
PERCENT_THRESHOLDS = {
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
@ -156,6 +157,10 @@ def root_usage_expr(scope=""):
|
|||||||
return filesystem_usage_expr("/", scope)
|
return filesystem_usage_expr("/", scope)
|
||||||
|
|
||||||
|
|
||||||
|
def astraios_usage_expr(scope=""):
    """Return the filesystem usage expression for the Astraios mountpoint.

    Thin wrapper around ``filesystem_usage_expr`` that pins the mountpoint
    to ``ASTRAIOS_MOUNTPOINT``; ``scope`` is forwarded unchanged.
    """
    mountpoint = ASTRAIOS_MOUNTPOINT
    return filesystem_usage_expr(mountpoint, scope)
|
||||||
|
|
||||||
|
|
||||||
def astreae_usage_expr(mount):
|
def astreae_usage_expr(mount):
|
||||||
return (
|
return (
|
||||||
f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / "
|
f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / "
|
||||||
@ -470,6 +475,7 @@ PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
|
|||||||
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
|
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) '
|
||||||
f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
|
f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))'
|
||||||
)
|
)
|
||||||
|
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))"
|
||||||
ANANKE_SELECTOR = 'job="ananke-power"'
|
ANANKE_SELECTOR = 'job="ananke-power"'
|
||||||
ANANKE_UPS_DB_NAME = "Pyrphoros"
|
ANANKE_UPS_DB_NAME = "Pyrphoros"
|
||||||
ANANKE_UPS_DB_NODE = "titan-db"
|
ANANKE_UPS_DB_NODE = "titan-db"
|
||||||
@ -1540,26 +1546,27 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
bargauge_panel(
|
bargauge_panel(
|
||||||
47,
|
47,
|
||||||
"Platform Suite Pass Rate (24h)",
|
"PVC Backup Health / Age",
|
||||||
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE,
|
PVC_BACKUP_AGE_HOURS_BY_PVC,
|
||||||
{"h": 5, "w": 6, "x": 18, "y": 7},
|
{"h": 5, "w": 6, "x": 18, "y": 7},
|
||||||
unit="percent",
|
unit="h",
|
||||||
instant=True,
|
instant=True,
|
||||||
legend="{{suite}}",
|
legend="{{namespace}}/{{pvc}}",
|
||||||
sort_order="desc",
|
sort_order="desc",
|
||||||
thresholds={
|
thresholds={
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
{"color": "red", "value": None},
|
{"color": "green", "value": None},
|
||||||
{"color": "yellow", "value": 80},
|
{"color": "yellow", "value": 6},
|
||||||
{"color": "green", "value": 95},
|
{"color": "orange", "value": 12},
|
||||||
|
{"color": "red", "value": 24},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
panels[-1]["links"] = link_to("atlas-jobs")
|
panels[-1]["links"] = link_to("atlas-storage")
|
||||||
panels[-1]["description"] = (
|
panels[-1]["description"] = (
|
||||||
"24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
|
"Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
|
||||||
)
|
)
|
||||||
|
|
||||||
panels.append(
|
panels.append(
|
||||||
@ -1847,8 +1854,8 @@ def build_overview():
|
|||||||
panels.append(
|
panels.append(
|
||||||
bargauge_panel(
|
bargauge_panel(
|
||||||
22,
|
22,
|
||||||
"Nodes Closest to Full Root Disks",
|
"Nodes Closest to Full Astraios Disks",
|
||||||
f"topk(12, {root_usage_expr()})",
|
f"topk(12, {astraios_usage_expr()})",
|
||||||
{"h": 16, "w": 12, "x": 12, "y": 71},
|
{"h": 16, "w": 12, "x": 12, "y": 71},
|
||||||
unit="percent",
|
unit="percent",
|
||||||
thresholds=PERCENT_THRESHOLDS,
|
thresholds=PERCENT_THRESHOLDS,
|
||||||
@ -2221,6 +2228,19 @@ def build_nodes_dashboard():
|
|||||||
time_from="30d",
|
time_from="30d",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
panels.append(
|
||||||
|
timeseries_panel(
|
||||||
|
9,
|
||||||
|
"Astraios Usage",
|
||||||
|
astraios_usage_expr(),
|
||||||
|
{"h": 9, "w": 24, "x": 0, "y": 44},
|
||||||
|
unit="percent",
|
||||||
|
legend="{{node}}",
|
||||||
|
legend_display="table",
|
||||||
|
legend_placement="right",
|
||||||
|
time_from="30d",
|
||||||
|
)
|
||||||
|
)
|
||||||
return {
|
return {
|
||||||
"uid": "atlas-nodes",
|
"uid": "atlas-nodes",
|
||||||
"title": "Atlas Nodes",
|
"title": "Atlas Nodes",
|
||||||
|
|||||||
@ -7,29 +7,42 @@ metadata:
|
|||||||
data:
|
data:
|
||||||
usb_scratch.env: |
|
usb_scratch.env: |
|
||||||
USB_SCRATCH_DEFAULT_ENABLED=true
|
USB_SCRATCH_DEFAULT_ENABLED=true
|
||||||
USB_SCRATCH_DEFAULT_LABEL=atlas-scratch
|
# Leave empty to avoid label-based fallback selection.
|
||||||
|
USB_SCRATCH_DEFAULT_LABEL=
|
||||||
USB_SCRATCH_DEFAULT_FSTYPE=ext4
|
USB_SCRATCH_DEFAULT_FSTYPE=ext4
|
||||||
USB_SCRATCH_MOUNTPOINT=/mnt/usb-scratch
|
USB_SCRATCH_MOUNTPOINT=/mnt/astraios
|
||||||
|
# Auto-select the removable 64GB USB partition on each worker.
|
||||||
|
USB_SCRATCH_AUTO_SELECT_REMOVABLE=true
|
||||||
|
USB_SCRATCH_AUTO_MIN_SIZE_GIB=50
|
||||||
|
# One-time bootstrap for new sticks that ship exfat/fat32.
|
||||||
|
USB_SCRATCH_AUTO_FORMAT_REMOVABLE=true
|
||||||
|
USB_SCRATCH_AUTO_FORMAT_LABEL=astraios
|
||||||
|
# Keep /tmp in RAM to reduce SD-card writes.
|
||||||
|
USB_SCRATCH_ENFORCE_TMPFS_TMP=true
|
||||||
USB_SCRATCH_REQUIRED_FREE_GIB=20
|
USB_SCRATCH_REQUIRED_FREE_GIB=20
|
||||||
USB_SCRATCH_RECONCILE_INTERVAL_SEC=900
|
USB_SCRATCH_RECONCILE_INTERVAL_SEC=900
|
||||||
USB_SCRATCH_CUTOVER_JITTER_MAX_SEC=900
|
USB_SCRATCH_CUTOVER_JITTER_MAX_SEC=900
|
||||||
usb_scratch_inventory.tsv: |
|
usb_scratch_inventory.tsv: |
|
||||||
# node_name enabled match_kind match_value fstype
|
# node_name enabled match_kind match_value fstype
|
||||||
# match_kind: uuid | label | device
|
# match_kind: uuid | label | device
|
||||||
# Prefer UUID entries for the first rollout. A shared label works too if every Pi USB stick is formatted consistently.
|
# Astraios policy:
|
||||||
|
# - use UUID entries per worker node (preferred)
|
||||||
|
# - avoid shared labels to prevent accidental wrong-device mounts
|
||||||
|
# - mountpoint is /mnt/astraios on every worker node
|
||||||
# Example:
|
# Example:
|
||||||
# titan-04 true uuid 11111111-2222-3333-4444-555555555555 ext4
|
# titan-04 true uuid 11111111-2222-3333-4444-555555555555 ext4
|
||||||
# titan-05 true label atlas-scratch ext4
|
# titan-05 true uuid <uuid-for-titan-05-astraios> ext4
|
||||||
# titan-06 true label atlas-scratch ext4
|
# titan-06 true uuid <uuid-for-titan-06-astraios> ext4
|
||||||
# titan-07 true label atlas-scratch ext4
|
# titan-07 true uuid <uuid-for-titan-07-astraios> ext4
|
||||||
# titan-08 true label atlas-scratch ext4
|
# titan-08 true uuid <uuid-for-titan-08-astraios> ext4
|
||||||
# titan-09 true label atlas-scratch ext4
|
# titan-09 true uuid <uuid-for-titan-09-astraios> ext4
|
||||||
# titan-10 true label atlas-scratch ext4
|
# titan-10 true uuid <uuid-for-titan-10-astraios> ext4
|
||||||
# titan-11 true label atlas-scratch ext4
|
# titan-11 true uuid <uuid-for-titan-11-astraios> ext4
|
||||||
# titan-12 true label atlas-scratch ext4
|
# titan-12 true uuid <uuid-for-titan-12-astraios> ext4
|
||||||
# titan-13 true label atlas-scratch ext4
|
# titan-13 true uuid <uuid-for-titan-13-astraios> ext4
|
||||||
# titan-14 true label atlas-scratch ext4
|
# titan-14 true uuid <uuid-for-titan-14-astraios> ext4
|
||||||
# titan-15 true label atlas-scratch ext4
|
# titan-15 true uuid <uuid-for-titan-15-astraios> ext4
|
||||||
# titan-17 true label atlas-scratch ext4
|
# titan-16 true uuid <uuid-for-titan-16-astraios> ext4
|
||||||
# titan-18 true label atlas-scratch ext4
|
# titan-17 true uuid <uuid-for-titan-17-astraios> ext4
|
||||||
# titan-19 true label atlas-scratch ext4
|
# titan-18 true uuid <uuid-for-titan-18-astraios> ext4
|
||||||
|
# titan-19 true uuid <uuid-for-titan-19-astraios> ext4
|
||||||
|
|||||||
@ -13,9 +13,14 @@ ONE_SHOT=${ONE_SHOT:-false}
|
|||||||
DEFAULT_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
|
DEFAULT_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
|
||||||
|
|
||||||
USB_SCRATCH_DEFAULT_ENABLED=${USB_SCRATCH_DEFAULT_ENABLED:-true}
|
USB_SCRATCH_DEFAULT_ENABLED=${USB_SCRATCH_DEFAULT_ENABLED:-true}
|
||||||
USB_SCRATCH_DEFAULT_LABEL=${USB_SCRATCH_DEFAULT_LABEL:-atlas-scratch}
|
USB_SCRATCH_DEFAULT_LABEL=${USB_SCRATCH_DEFAULT_LABEL:-}
|
||||||
USB_SCRATCH_DEFAULT_FSTYPE=${USB_SCRATCH_DEFAULT_FSTYPE:-ext4}
|
USB_SCRATCH_DEFAULT_FSTYPE=${USB_SCRATCH_DEFAULT_FSTYPE:-ext4}
|
||||||
USB_SCRATCH_MOUNTPOINT=${USB_SCRATCH_MOUNTPOINT:-/mnt/usb-scratch}
|
USB_SCRATCH_MOUNTPOINT=${USB_SCRATCH_MOUNTPOINT:-/mnt/astraios}
|
||||||
|
USB_SCRATCH_ENFORCE_TMPFS_TMP=${USB_SCRATCH_ENFORCE_TMPFS_TMP:-true}
|
||||||
|
USB_SCRATCH_AUTO_SELECT_REMOVABLE=${USB_SCRATCH_AUTO_SELECT_REMOVABLE:-true}
|
||||||
|
USB_SCRATCH_AUTO_MIN_SIZE_GIB=${USB_SCRATCH_AUTO_MIN_SIZE_GIB:-50}
|
||||||
|
USB_SCRATCH_AUTO_FORMAT_REMOVABLE=${USB_SCRATCH_AUTO_FORMAT_REMOVABLE:-true}
|
||||||
|
USB_SCRATCH_AUTO_FORMAT_LABEL=${USB_SCRATCH_AUTO_FORMAT_LABEL:-astraios}
|
||||||
USB_SCRATCH_REQUIRED_FREE_GIB=${USB_SCRATCH_REQUIRED_FREE_GIB:-20}
|
USB_SCRATCH_REQUIRED_FREE_GIB=${USB_SCRATCH_REQUIRED_FREE_GIB:-20}
|
||||||
USB_SCRATCH_RECONCILE_INTERVAL_SEC=${USB_SCRATCH_RECONCILE_INTERVAL_SEC:-900}
|
USB_SCRATCH_RECONCILE_INTERVAL_SEC=${USB_SCRATCH_RECONCILE_INTERVAL_SEC:-900}
|
||||||
USB_SCRATCH_CUTOVER_JITTER_MAX_SEC=${USB_SCRATCH_CUTOVER_JITTER_MAX_SEC:-900}
|
USB_SCRATCH_CUTOVER_JITTER_MAX_SEC=${USB_SCRATCH_CUTOVER_JITTER_MAX_SEC:-900}
|
||||||
@ -24,7 +29,9 @@ TARGET_PATHS=(
|
|||||||
"/var/log/pods"
|
"/var/log/pods"
|
||||||
"/var/log/containers"
|
"/var/log/containers"
|
||||||
"/var/lib/rancher/k3s/agent/containerd"
|
"/var/lib/rancher/k3s/agent/containerd"
|
||||||
|
"/var/lib/rancher/k3s/agent/kubelet"
|
||||||
"/var/lib/rancher/k3s/agent/images"
|
"/var/lib/rancher/k3s/agent/images"
|
||||||
|
"/var/tmp"
|
||||||
)
|
)
|
||||||
|
|
||||||
agent_stopped=0
|
agent_stopped=0
|
||||||
@ -44,6 +51,12 @@ annotate_node() {
|
|||||||
local timestamp
|
local timestamp
|
||||||
timestamp="$(date -u +%FT%TZ)"
|
timestamp="$(date -u +%FT%TZ)"
|
||||||
kubectl annotate --overwrite node "${NODE_NAME}" \
|
kubectl annotate --overwrite node "${NODE_NAME}" \
|
||||||
|
maintenance.bstein.dev/astraios-status="$(sanitize_annotation_value "${status}")" \
|
||||||
|
maintenance.bstein.dev/astraios-detail="$(sanitize_annotation_value "${detail}")" \
|
||||||
|
maintenance.bstein.dev/astraios-selector="$(sanitize_annotation_value "${selector}")" \
|
||||||
|
maintenance.bstein.dev/astraios-mountpoint="$(sanitize_annotation_value "${USB_SCRATCH_MOUNTPOINT}")" \
|
||||||
|
maintenance.bstein.dev/astraios-managed-paths="$(sanitize_annotation_value "${TARGET_PATHS[*]}")" \
|
||||||
|
maintenance.bstein.dev/astraios-last-apply="${timestamp}" \
|
||||||
maintenance.bstein.dev/usb-scratch-status="$(sanitize_annotation_value "${status}")" \
|
maintenance.bstein.dev/usb-scratch-status="$(sanitize_annotation_value "${status}")" \
|
||||||
maintenance.bstein.dev/usb-scratch-detail="$(sanitize_annotation_value "${detail}")" \
|
maintenance.bstein.dev/usb-scratch-detail="$(sanitize_annotation_value "${detail}")" \
|
||||||
maintenance.bstein.dev/usb-scratch-selector="$(sanitize_annotation_value "${selector}")" \
|
maintenance.bstein.dev/usb-scratch-selector="$(sanitize_annotation_value "${selector}")" \
|
||||||
@ -82,9 +95,14 @@ load_config() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
USB_SCRATCH_DEFAULT_ENABLED=${USB_SCRATCH_DEFAULT_ENABLED:-true}
|
USB_SCRATCH_DEFAULT_ENABLED=${USB_SCRATCH_DEFAULT_ENABLED:-true}
|
||||||
USB_SCRATCH_DEFAULT_LABEL=${USB_SCRATCH_DEFAULT_LABEL:-atlas-scratch}
|
USB_SCRATCH_DEFAULT_LABEL=${USB_SCRATCH_DEFAULT_LABEL:-}
|
||||||
USB_SCRATCH_DEFAULT_FSTYPE=${USB_SCRATCH_DEFAULT_FSTYPE:-ext4}
|
USB_SCRATCH_DEFAULT_FSTYPE=${USB_SCRATCH_DEFAULT_FSTYPE:-ext4}
|
||||||
USB_SCRATCH_MOUNTPOINT=${USB_SCRATCH_MOUNTPOINT:-/mnt/usb-scratch}
|
USB_SCRATCH_MOUNTPOINT=${USB_SCRATCH_MOUNTPOINT:-/mnt/astraios}
|
||||||
|
USB_SCRATCH_ENFORCE_TMPFS_TMP=${USB_SCRATCH_ENFORCE_TMPFS_TMP:-true}
|
||||||
|
USB_SCRATCH_AUTO_SELECT_REMOVABLE=${USB_SCRATCH_AUTO_SELECT_REMOVABLE:-true}
|
||||||
|
USB_SCRATCH_AUTO_MIN_SIZE_GIB=${USB_SCRATCH_AUTO_MIN_SIZE_GIB:-50}
|
||||||
|
USB_SCRATCH_AUTO_FORMAT_REMOVABLE=${USB_SCRATCH_AUTO_FORMAT_REMOVABLE:-true}
|
||||||
|
USB_SCRATCH_AUTO_FORMAT_LABEL=${USB_SCRATCH_AUTO_FORMAT_LABEL:-astraios}
|
||||||
USB_SCRATCH_REQUIRED_FREE_GIB=${USB_SCRATCH_REQUIRED_FREE_GIB:-20}
|
USB_SCRATCH_REQUIRED_FREE_GIB=${USB_SCRATCH_REQUIRED_FREE_GIB:-20}
|
||||||
USB_SCRATCH_RECONCILE_INTERVAL_SEC=${USB_SCRATCH_RECONCILE_INTERVAL_SEC:-900}
|
USB_SCRATCH_RECONCILE_INTERVAL_SEC=${USB_SCRATCH_RECONCILE_INTERVAL_SEC:-900}
|
||||||
USB_SCRATCH_CUTOVER_JITTER_MAX_SEC=${USB_SCRATCH_CUTOVER_JITTER_MAX_SEC:-900}
|
USB_SCRATCH_CUTOVER_JITTER_MAX_SEC=${USB_SCRATCH_CUTOVER_JITTER_MAX_SEC:-900}
|
||||||
@ -117,15 +135,22 @@ ensure_fstab_block() {
|
|||||||
tmp_candidate="${STATE_DIR}/fstab.candidate"
|
tmp_candidate="${STATE_DIR}/fstab.candidate"
|
||||||
|
|
||||||
strip_managed_block "${FSTAB_PATH}" > "${tmp_base}"
|
strip_managed_block "${FSTAB_PATH}" > "${tmp_base}"
|
||||||
|
if [ "${USB_SCRATCH_ENFORCE_TMPFS_TMP}" = "true" ]; then
|
||||||
|
awk '$1 ~ /^#/ || $2 != "/tmp" { print }' "${tmp_base}" > "${tmp_base}.tmpfs"
|
||||||
|
mv "${tmp_base}.tmpfs" "${tmp_base}"
|
||||||
|
fi
|
||||||
cp "${tmp_base}" "${tmp_candidate}"
|
cp "${tmp_base}" "${tmp_candidate}"
|
||||||
|
|
||||||
{
|
{
|
||||||
printf '%s\n' "${MANAGED_BEGIN}"
|
printf '%s\n' "${MANAGED_BEGIN}"
|
||||||
printf '%s %s %s defaults,nofail,noatime,lazytime,commit=60,x-systemd.device-timeout=15s,x-systemd.mount-timeout=30s 0 2\n' \
|
printf '%s %s %s defaults,noatime,lazytime,commit=60,x-systemd.device-timeout=15s,x-systemd.mount-timeout=30s 0 2\n' \
|
||||||
"${selector}" "${USB_SCRATCH_MOUNTPOINT}" "${fstype}"
|
"${selector}" "${USB_SCRATCH_MOUNTPOINT}" "${fstype}"
|
||||||
|
if [ "${USB_SCRATCH_ENFORCE_TMPFS_TMP}" = "true" ]; then
|
||||||
|
printf '%s\n' 'tmpfs /tmp tmpfs defaults,nosuid,nodev,mode=1777 0 0'
|
||||||
|
fi
|
||||||
for target in "${TARGET_PATHS[@]}"; do
|
for target in "${TARGET_PATHS[@]}"; do
|
||||||
bind_source="${USB_SCRATCH_MOUNTPOINT}${target}"
|
bind_source="${USB_SCRATCH_MOUNTPOINT}${target}"
|
||||||
printf '%s %s none bind,nofail,x-systemd.requires-mounts-for=%s 0 0\n' \
|
printf '%s %s none bind,x-systemd.requires-mounts-for=%s 0 0\n' \
|
||||||
"${bind_source}" "${target}" "${USB_SCRATCH_MOUNTPOINT}"
|
"${bind_source}" "${target}" "${USB_SCRATCH_MOUNTPOINT}"
|
||||||
done
|
done
|
||||||
printf '%s\n' "${MANAGED_END}"
|
printf '%s\n' "${MANAGED_END}"
|
||||||
@ -140,11 +165,84 @@ ensure_fstab_block() {
|
|||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Install the k3s-agent start guard on the host:
#   * a verify script that fails unless the Astraios mount and every managed
#     bind mount in TARGET_PATHS are in place, and
#   * a systemd drop-in that runs the script via ExecStartPre and declares
#     RequiresMountsFor on the same paths.
# Returns 0 when the drop-in was created or changed (caller should run
# daemon-reload), 1 when the drop-in was already up to date.
ensure_k3s_agent_guard() {
  local dropin_dir dropin_file guard_dir guard_file guard_tmp target verify_cmd requires_mounts
  local tmp_dropin

  dropin_dir="${HOST_ROOT}/etc/systemd/system/k3s-agent.service.d"
  dropin_file="${dropin_dir}/20-astraios-guard.conf"
  guard_dir="${HOST_ROOT}/usr/local/lib/maintenance"
  guard_file="${guard_dir}/verify_astraios_mounts.sh"
  guard_tmp="${guard_file}.tmp"
  tmp_dropin="${STATE_DIR}/k3s-agent-astraios-dropin.conf"

  mkdir -p "${dropin_dir}" "${guard_dir}" "${STATE_DIR}"

  # Build the guard in a temp file and rename it into place so a concurrent
  # k3s-agent start never executes a half-written script.
  cat > "${guard_tmp}" <<EOF
#!/usr/bin/env bash
set -euo pipefail
if ! mountpoint -q '${USB_SCRATCH_MOUNTPOINT}'; then
  echo "astraios guard: ${USB_SCRATCH_MOUNTPOINT} is not mounted" >&2
  exit 1
fi
# Device backing the Astraios mount; -v strips any [/subdir] suffix.
base_src=\$(findmnt -T '${USB_SCRATCH_MOUNTPOINT}' -n -v -o SOURCE 2>/dev/null | tail -n 1 || true)
if [[ -z "\${base_src}" ]]; then
  echo "astraios guard: cannot resolve the device backing ${USB_SCRATCH_MOUNTPOINT}" >&2
  exit 1
fi
EOF
  for target in "${TARGET_PATHS[@]}"; do
    # For a bind mount findmnt reports SOURCE as "device[/subdir]", never the
    # bind source directory, so compare the backing device and the filesystem
    # root instead. The bind source is <mountpoint><target>, so the expected
    # FSROOT equals the target path itself.
    cat >> "${guard_tmp}" <<EOF
src=\$(findmnt -T '${target}' -n -v -o SOURCE 2>/dev/null | tail -n 1 || true)
fsroot=\$(findmnt -T '${target}' -n -o FSROOT 2>/dev/null | tail -n 1 || true)
if [[ "\${src}" != "\${base_src}" || "\${fsroot}" != '${target}' ]]; then
  echo "astraios guard: ${target} is not bound to ${USB_SCRATCH_MOUNTPOINT}${target}" >&2
  exit 1
fi
EOF
  done
  chmod 0755 "${guard_tmp}"
  mv -f "${guard_tmp}" "${guard_file}"

  requires_mounts="${USB_SCRATCH_MOUNTPOINT}"
  for target in "${TARGET_PATHS[@]}"; do
    requires_mounts="${requires_mounts} ${target}"
  done
  # Path of the guard as seen from the host; quote the prefix so it is
  # stripped literally rather than treated as a glob pattern.
  verify_cmd="${guard_file#"${HOST_ROOT}"}"

  cat > "${tmp_dropin}" <<EOF
[Unit]
RequiresMountsFor=${requires_mounts}
After=local-fs.target

[Service]
ExecStartPre=${verify_cmd}
EOF

  # Only touch the live drop-in (and report a change) when its content differs.
  if [ ! -f "${dropin_file}" ] || ! cmp -s "${dropin_file}" "${tmp_dropin}"; then
    cp "${tmp_dropin}" "${dropin_file}"
    log "updated k3s-agent Astraios guard drop-in"
    return 0
  fi
  return 1
}
|
||||||
|
|
||||||
|
# Succeed (exit 0) only when findmnt on the host reports tmpfs as the
# filesystem type for /tmp; any other or empty answer yields exit 1.
tmp_is_tmpfs() {
  local tmp_fstype
  tmp_fstype="$(host_sh "findmnt -T /tmp -n -o FSTYPE 2>/dev/null || true")"
  if [ "${tmp_fstype}" = "tmpfs" ]; then
    return 0
  fi
  return 1
}
|
||||||
|
|
||||||
|
# Make sure /tmp exists on the host with mode 1777 and, if it is not already
# tmpfs, mount it: first via the fstab entry (`mount /tmp`), then with an
# explicit tmpfs mount as fallback. Returns the status of the host command.
ensure_tmp_tmpfs_live() {
  local enforce_cmd
  enforce_cmd="mkdir -p /tmp; chmod 1777 /tmp; fstype=\$(findmnt -T /tmp -n -o FSTYPE 2>/dev/null || true); if [ \"\${fstype}\" != \"tmpfs\" ]; then mount /tmp 2>/dev/null || mount -t tmpfs -o defaults,nosuid,nodev,mode=1777 tmpfs /tmp; fi"
  host_sh "${enforce_cmd}"
}
|
||||||
|
|
||||||
# Print the findmnt SOURCE column for the mount that contains the given host
# path; prints nothing (and still succeeds) when findmnt cannot resolve it.
find_existing_mount_source() {
  local target=$1
  local lookup_cmd
  lookup_cmd="findmnt -T '${target}' -n -o SOURCE 2>/dev/null || true"
  host_sh "${lookup_cmd}"
}
|
||||||
|
|
||||||
|
# Print the device path of the first partition lsblk lists on the host that
# is flagged removable (RM=1) and is at least USB_SCRATCH_AUTO_MIN_SIZE_GIB
# GiB in size. Prints nothing when no partition qualifies.
auto_discover_removable_partition() {
  local min_bytes part_filter
  min_bytes=$(( USB_SCRATCH_AUTO_MIN_SIZE_GIB * 1024 * 1024 * 1024 ))
  # awk program: columns are NAME, TYPE, SIZE (bytes), RM; stop at first hit.
  part_filter="\$2==\"part\" && \$4==\"1\" && \$3>=${min_bytes} {print \$1; exit}"
  host_sh "lsblk -brnpo NAME,TYPE,SIZE,RM | awk '${part_filter}'"
}
|
||||||
|
|
||||||
|
# Destructively reformat a host block device as ext4.
#   $1  device path (e.g. the auto-discovered removable partition)
#   $2  filesystem label to apply
# Every mount of the device is unmounted first. If the device is still
# mounted afterwards the format is refused, and wipefs/mkfs are chained so a
# failure in any step is reported instead of being masked by a later step.
# Returns non-zero when the device could not be unmounted, wiped or formatted.
format_device_ext4() {
  local device="$1"
  local label="$2"
  # findmnt -S may list several targets (e.g. bind mounts of the same device),
  # so unmount them one by one rather than passing a multi-line value to umount.
  host_sh "findmnt -S '${device}' -n -o TARGET 2>/dev/null | while IFS= read -r mounted_at; do umount \"\${mounted_at}\" || break; done; if findmnt -S '${device}' >/dev/null 2>&1; then echo \"refusing to format ${device}: still mounted\" >&2; false; else wipefs -a '${device}' && mkfs.ext4 -F -L '${label}' '${device}'; fi"
}
|
||||||
|
|
||||||
resolve_selector() {
|
resolve_selector() {
|
||||||
local inventory_line enabled kind value fstype actual_device actual_fstype actual_uuid actual_label selector expected_fstype
|
local inventory_line enabled kind value fstype actual_device actual_fstype actual_uuid actual_label selector expected_fstype
|
||||||
inventory_line="$(lookup_inventory)"
|
inventory_line="$(lookup_inventory)"
|
||||||
@ -158,6 +256,9 @@ resolve_selector() {
|
|||||||
elif [ -n "${USB_SCRATCH_DEFAULT_LABEL}" ]; then
|
elif [ -n "${USB_SCRATCH_DEFAULT_LABEL}" ]; then
|
||||||
kind="label"
|
kind="label"
|
||||||
value="${USB_SCRATCH_DEFAULT_LABEL}"
|
value="${USB_SCRATCH_DEFAULT_LABEL}"
|
||||||
|
elif [ "${USB_SCRATCH_AUTO_SELECT_REMOVABLE}" = "true" ]; then
|
||||||
|
kind="auto"
|
||||||
|
value="removable-${USB_SCRATCH_AUTO_MIN_SIZE_GIB}Gi-plus"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "${enabled}" != "true" ]; then
|
if [ "${enabled}" != "true" ]; then
|
||||||
@ -191,6 +292,10 @@ resolve_selector() {
|
|||||||
selector="${value}"
|
selector="${value}"
|
||||||
actual_device="$(host_sh "if [ -b '${value}' ]; then printf '%s' '${value}'; fi")"
|
actual_device="$(host_sh "if [ -b '${value}' ]; then printf '%s' '${value}'; fi")"
|
||||||
;;
|
;;
|
||||||
|
auto)
|
||||||
|
actual_device="$(auto_discover_removable_partition)"
|
||||||
|
selector="${actual_device}"
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
SELECTOR_KIND="invalid"
|
SELECTOR_KIND="invalid"
|
||||||
SELECTOR_VALUE="${value}"
|
SELECTOR_VALUE="${value}"
|
||||||
@ -204,13 +309,26 @@ resolve_selector() {
|
|||||||
actual_fstype=""
|
actual_fstype=""
|
||||||
actual_uuid=""
|
actual_uuid=""
|
||||||
actual_label=""
|
actual_label=""
|
||||||
|
if [ "${kind}" = "auto" ] && [ -z "${actual_device}" ]; then
|
||||||
|
SELECTOR_KIND="missing"
|
||||||
|
SELECTOR_VALUE="${value}"
|
||||||
|
SELECTOR_SPEC=""
|
||||||
|
DEVICE_PATH=""
|
||||||
|
DEVICE_FSTYPE="${fstype}"
|
||||||
|
SELECTOR_MATCH_KIND="${kind}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
if [ -n "${actual_device}" ]; then
|
if [ -n "${actual_device}" ]; then
|
||||||
actual_fstype="$(host_sh "blkid -o value -s TYPE '${actual_device}' 2>/dev/null || true")"
|
actual_fstype="$(host_sh "blkid -o value -s TYPE '${actual_device}' 2>/dev/null || true")"
|
||||||
actual_uuid="$(host_sh "blkid -o value -s UUID '${actual_device}' 2>/dev/null || true")"
|
actual_uuid="$(host_sh "blkid -o value -s UUID '${actual_device}' 2>/dev/null || true")"
|
||||||
actual_label="$(host_sh "blkid -o value -s LABEL '${actual_device}' 2>/dev/null || true")"
|
actual_label="$(host_sh "blkid -o value -s LABEL '${actual_device}' 2>/dev/null || true")"
|
||||||
fi
|
fi
|
||||||
|
if [ "${kind}" = "auto" ] && [ -n "${actual_uuid}" ]; then
|
||||||
|
selector="UUID=${actual_uuid}"
|
||||||
|
fi
|
||||||
|
|
||||||
expected_fstype="${fstype:-${USB_SCRATCH_DEFAULT_FSTYPE}}"
|
expected_fstype="${fstype:-${USB_SCRATCH_DEFAULT_FSTYPE}}"
|
||||||
|
SELECTOR_MATCH_KIND="${kind}"
|
||||||
if [ -n "${actual_fstype}" ] && [ -n "${expected_fstype}" ] && [ "${actual_fstype}" != "${expected_fstype}" ]; then
|
if [ -n "${actual_fstype}" ] && [ -n "${expected_fstype}" ] && [ "${actual_fstype}" != "${expected_fstype}" ]; then
|
||||||
SELECTOR_KIND="fs-mismatch"
|
SELECTOR_KIND="fs-mismatch"
|
||||||
SELECTOR_VALUE="${selector}"
|
SELECTOR_VALUE="${selector}"
|
||||||
@ -298,7 +416,7 @@ perform_cutover() {
|
|||||||
sleep "${jitter}"
|
sleep "${jitter}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
log "stopping k3s-agent for USB scratch cutover"
|
log "stopping k3s-agent for Astraios cutover"
|
||||||
host_sh "systemctl stop k3s-agent"
|
host_sh "systemctl stop k3s-agent"
|
||||||
agent_stopped=1
|
agent_stopped=1
|
||||||
|
|
||||||
@ -311,13 +429,13 @@ perform_cutover() {
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
log "starting k3s-agent after USB scratch cutover"
|
log "starting k3s-agent after Astraios cutover"
|
||||||
host_sh "systemctl start k3s-agent"
|
host_sh "systemctl start k3s-agent"
|
||||||
agent_stopped=0
|
agent_stopped=0
|
||||||
}
|
}
|
||||||
|
|
||||||
reconcile_once() {
|
reconcile_once() {
|
||||||
local fstab_changed=false free_gib selector_detail
|
local fstab_changed=false guard_changed=false free_gib selector_detail tmp_detail
|
||||||
|
|
||||||
load_config
|
load_config
|
||||||
resolve_selector
|
resolve_selector
|
||||||
@ -326,7 +444,7 @@ reconcile_once() {
|
|||||||
case "${SELECTOR_KIND}" in
|
case "${SELECTOR_KIND}" in
|
||||||
disabled)
|
disabled)
|
||||||
annotate_node "disabled" "inventory-disabled" "none"
|
annotate_node "disabled" "inventory-disabled" "none"
|
||||||
log "inventory disables USB scratch on ${NODE_NAME}"
|
log "inventory disables Astraios on ${NODE_NAME}"
|
||||||
return 0
|
return 0
|
||||||
;;
|
;;
|
||||||
missing)
|
missing)
|
||||||
@ -340,9 +458,17 @@ reconcile_once() {
|
|||||||
return 0
|
return 0
|
||||||
;;
|
;;
|
||||||
fs-mismatch)
|
fs-mismatch)
|
||||||
annotate_node "error" "filesystem-mismatch" "${SELECTOR_SPEC}"
|
if [ "${USB_SCRATCH_AUTO_FORMAT_REMOVABLE}" = "true" ] && [ "${SELECTOR_MATCH_KIND:-}" = "auto" ] && [ -n "${DEVICE_PATH}" ]; then
|
||||||
log "filesystem mismatch on ${DEVICE_PATH}: expected ${USB_SCRATCH_DEFAULT_FSTYPE}, got ${DEVICE_FSTYPE}"
|
log "formatting auto-discovered device ${DEVICE_PATH} as ext4 label=${USB_SCRATCH_AUTO_FORMAT_LABEL}"
|
||||||
return 0
|
if format_device_ext4 "${DEVICE_PATH}" "${USB_SCRATCH_AUTO_FORMAT_LABEL}"; then
|
||||||
|
resolve_selector
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
if [ "${SELECTOR_KIND}" = "fs-mismatch" ]; then
|
||||||
|
annotate_node "error" "filesystem-mismatch" "${SELECTOR_SPEC}"
|
||||||
|
log "filesystem mismatch on ${DEVICE_PATH}: expected ${USB_SCRATCH_DEFAULT_FSTYPE}, got ${DEVICE_FSTYPE}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
@ -355,8 +481,8 @@ reconcile_once() {
|
|||||||
|
|
||||||
if [ -z "${DEVICE_PATH}" ]; then
|
if [ -z "${DEVICE_PATH}" ]; then
|
||||||
annotate_node "pending" "device-not-found" "${selector_detail}"
|
annotate_node "pending" "device-not-found" "${selector_detail}"
|
||||||
log "scratch device not present yet for selector ${selector_detail}"
|
log "Astraios device not present yet for selector ${selector_detail}"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if ! ensure_usb_mount_live; then
|
if ! ensure_usb_mount_live; then
|
||||||
@ -372,10 +498,15 @@ reconcile_once() {
|
|||||||
|
|
||||||
if [ "${free_gib}" -lt "${USB_SCRATCH_REQUIRED_FREE_GIB}" ]; then
|
if [ "${free_gib}" -lt "${USB_SCRATCH_REQUIRED_FREE_GIB}" ]; then
|
||||||
annotate_node "error" "insufficient-free-space-${free_gib}Gi" "${selector_detail}"
|
annotate_node "error" "insufficient-free-space-${free_gib}Gi" "${selector_detail}"
|
||||||
log "usb scratch free space ${free_gib}Gi below required ${USB_SCRATCH_REQUIRED_FREE_GIB}Gi"
|
log "Astraios free space ${free_gib}Gi below required ${USB_SCRATCH_REQUIRED_FREE_GIB}Gi"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if ensure_k3s_agent_guard; then
|
||||||
|
guard_changed=true
|
||||||
|
host_sh "systemctl daemon-reload || true"
|
||||||
|
fi
|
||||||
|
|
||||||
if host_sh "systemctl list-unit-files | grep -q '^k3s-agent.service'"; then
|
if host_sh "systemctl list-unit-files | grep -q '^k3s-agent.service'"; then
|
||||||
perform_cutover
|
perform_cutover
|
||||||
else
|
else
|
||||||
@ -389,11 +520,26 @@ reconcile_once() {
|
|||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "${fstab_changed}" = true ]; then
|
tmp_detail="tmpfs-ok"
|
||||||
log "usb scratch fstab refreshed for ${NODE_NAME}"
|
if [ "${USB_SCRATCH_ENFORCE_TMPFS_TMP}" = "true" ]; then
|
||||||
|
if ! ensure_tmp_tmpfs_live || ! tmp_is_tmpfs; then
|
||||||
|
annotate_node "error" "tmpfs-tmp-enforce-failed" "${selector_detail}"
|
||||||
|
log "failed to enforce /tmp tmpfs on ${NODE_NAME}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
elif ! tmp_is_tmpfs; then
|
||||||
|
tmp_detail="tmp-not-tmpfs"
|
||||||
|
log "warning: /tmp is not tmpfs on ${NODE_NAME}; SD wear reduction is lower than expected"
|
||||||
fi
|
fi
|
||||||
annotate_node "ready" "scratch-online-${free_gib}Gi" "${selector_detail}"
|
|
||||||
log "usb scratch ready on ${NODE_NAME} via ${selector_detail} mounted at ${USB_SCRATCH_MOUNTPOINT}"
|
if [ "${fstab_changed}" = true ]; then
|
||||||
|
log "Astraios fstab refreshed for ${NODE_NAME}"
|
||||||
|
fi
|
||||||
|
if [ "${guard_changed}" = true ]; then
|
||||||
|
log "k3s-agent Astraios guard refreshed for ${NODE_NAME}"
|
||||||
|
fi
|
||||||
|
annotate_node "ready" "astraios-online-${free_gib}Gi-${tmp_detail}" "${selector_detail}"
|
||||||
|
log "Astraios ready on ${NODE_NAME} via ${selector_detail} mounted at ${USB_SCRATCH_MOUNTPOINT}"
|
||||||
}
|
}
|
||||||
|
|
||||||
main() {
|
main() {
|
||||||
|
|||||||
@ -584,6 +584,44 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"timeFrom": "30d"
|
"timeFrom": "30d"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Astraios Usage",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 9,
|
||||||
|
"w": 24,
|
||||||
|
"x": 0,
|
||||||
|
"y": 44
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{node}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "right"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"timeFrom": "30d"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
@ -1957,7 +1957,7 @@
|
|||||||
{
|
{
|
||||||
"id": 47,
|
"id": 47,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Platform Suite Pass Rate (24h)",
|
"title": "PVC Backup Health / Age",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1970,31 +1970,35 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
|
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{suite}}",
|
"legendFormat": "{{namespace}}/{{pvc}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {
|
||||||
"unit": "percent",
|
"unit": "h",
|
||||||
"min": 0,
|
"min": 0,
|
||||||
"max": 100,
|
"max": null,
|
||||||
"thresholds": {
|
"thresholds": {
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
{
|
{
|
||||||
"color": "red",
|
"color": "green",
|
||||||
"value": null
|
"value": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "yellow",
|
"color": "yellow",
|
||||||
"value": 80
|
"value": 6
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "green",
|
"color": "orange",
|
||||||
"value": 95
|
"value": 12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 24
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@ -2025,12 +2029,12 @@
|
|||||||
],
|
],
|
||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
"title": "Open atlas-jobs dashboard",
|
"title": "Open atlas-storage dashboard",
|
||||||
"url": "/d/atlas-jobs",
|
"url": "/d/atlas-storage",
|
||||||
"targetBlank": true
|
"targetBlank": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
|
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 30,
|
"id": 30,
|
||||||
@ -3172,7 +3176,7 @@
|
|||||||
{
|
{
|
||||||
"id": 22,
|
"id": 22,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Nodes Closest to Full Root Disks",
|
"title": "Nodes Closest to Full Astraios Disks",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -3185,7 +3189,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
|
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}"
|
"legendFormat": "{{node}}"
|
||||||
}
|
}
|
||||||
|
|||||||
@ -593,6 +593,44 @@ data:
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"timeFrom": "30d"
|
"timeFrom": "30d"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Astraios Usage",
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "atlas-vm"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 9,
|
||||||
|
"w": 24,
|
||||||
|
"x": 0,
|
||||||
|
"y": 44
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))",
|
||||||
|
"refId": "A",
|
||||||
|
"legendFormat": "{{node}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "percent"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "right"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"timeFrom": "30d"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
@ -1966,7 +1966,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 47,
|
"id": 47,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Platform Suite Pass Rate (24h)",
|
"title": "PVC Backup Health / Age",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1979,31 +1979,35 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))",
|
"expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{suite}}",
|
"legendFormat": "{{namespace}}/{{pvc}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"fieldConfig": {
|
"fieldConfig": {
|
||||||
"defaults": {
|
"defaults": {
|
||||||
"unit": "percent",
|
"unit": "h",
|
||||||
"min": 0,
|
"min": 0,
|
||||||
"max": 100,
|
"max": null,
|
||||||
"thresholds": {
|
"thresholds": {
|
||||||
"mode": "absolute",
|
"mode": "absolute",
|
||||||
"steps": [
|
"steps": [
|
||||||
{
|
{
|
||||||
"color": "red",
|
"color": "green",
|
||||||
"value": null
|
"value": null
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "yellow",
|
"color": "yellow",
|
||||||
"value": 80
|
"value": 6
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"color": "green",
|
"color": "orange",
|
||||||
"value": 95
|
"value": 12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 24
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@ -2034,12 +2038,12 @@ data:
|
|||||||
],
|
],
|
||||||
"links": [
|
"links": [
|
||||||
{
|
{
|
||||||
"title": "Open atlas-jobs dashboard",
|
"title": "Open atlas-storage dashboard",
|
||||||
"url": "/d/atlas-jobs",
|
"url": "/d/atlas-storage",
|
||||||
"targetBlank": true
|
"targetBlank": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture."
|
"description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 30,
|
"id": 30,
|
||||||
@ -3181,7 +3185,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 22,
|
"id": 22,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Nodes Closest to Full Root Disks",
|
"title": "Nodes Closest to Full Astraios Disks",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -3194,7 +3198,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
|
"expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{node}}"
|
"legendFormat": "{{node}}"
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user