diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 3cac538e..29eeb56f 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -35,6 +35,7 @@ data: PROM_DS = {"type": "prometheus", "uid": "atlas-vm"} PUBLIC_FOLDER = "overview" PRIVATE_FOLDER = "atlas-internal" +ASTRAIOS_MOUNTPOINT = "/mnt/astraios" PERCENT_THRESHOLDS = { "mode": "absolute", @@ -156,6 +157,10 @@ def root_usage_expr(scope=""): return filesystem_usage_expr("/", scope) +def astraios_usage_expr(scope=""): + return filesystem_usage_expr(ASTRAIOS_MOUNTPOINT, scope) + + def astreae_usage_expr(mount): return ( f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / " @@ -470,6 +475,7 @@ PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = ( f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))), 1)) ' f'and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}[24h]))) > 0))' ) +PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))" ANANKE_SELECTOR = 'job="ananke-power"' ANANKE_UPS_DB_NAME = "Pyrphoros" ANANKE_UPS_DB_NODE = "titan-db" @@ -1540,26 +1546,27 @@ def build_overview(): panels.append( bargauge_panel( 47, - "Platform Suite Pass Rate (24h)", - PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE, + "PVC Backup Health / Age", + PVC_BACKUP_AGE_HOURS_BY_PVC, {"h": 5, "w": 6, "x": 18, "y": 7}, - unit="percent", + unit="h", instant=True, - legend="{{suite}}", + legend="{{namespace}}/{{pvc}}", sort_order="desc", thresholds={ "mode": "absolute", "steps": [ - {"color": "red", "value": None}, - {"color": "yellow", "value": 80}, - {"color": "green", "value": 95}, + {"color": "green", "value": None}, + {"color": "yellow", "value": 6}, + {"color": "orange", "value": 12}, + {"color": "red", "value": 24}, ], }, ) ) - panels[-1]["links"] = link_to("atlas-jobs") + panels[-1]["links"] = link_to("atlas-storage") panels[-1]["description"] = ( - "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture." + "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published." ) panels.append( @@ -1847,8 +1854,8 @@ def build_overview(): panels.append( bargauge_panel( 22, - "Nodes Closest to Full Root Disks", - f"topk(12, {root_usage_expr()})", + "Nodes Closest to Full Astraios Disks", + f"topk(12, {astraios_usage_expr()})", {"h": 16, "w": 12, "x": 12, "y": 71}, unit="percent", thresholds=PERCENT_THRESHOLDS, @@ -2221,6 +2228,19 @@ def build_nodes_dashboard(): time_from="30d", ) ) + panels.append( + timeseries_panel( + 9, + "Astraios Usage", + astraios_usage_expr(), + {"h": 9, "w": 24, "x": 0, "y": 44}, + unit="percent", + legend="{{node}}", + legend_display="table", + legend_placement="right", + time_from="30d", + ) + ) return { "uid": "atlas-nodes", "title": "Atlas Nodes", diff --git a/services/maintenance/pi-usb-scratch-configmap.yaml b/services/maintenance/pi-usb-scratch-configmap.yaml index 18fc6712..bc216ead 100644 --- a/services/maintenance/pi-usb-scratch-configmap.yaml +++ b/services/maintenance/pi-usb-scratch-configmap.yaml @@ -7,29 +7,42 @@ metadata: data: usb_scratch.env: | USB_SCRATCH_DEFAULT_ENABLED=true - USB_SCRATCH_DEFAULT_LABEL=atlas-scratch + # Leave empty to avoid label-based fallback selection. + USB_SCRATCH_DEFAULT_LABEL= USB_SCRATCH_DEFAULT_FSTYPE=ext4 - USB_SCRATCH_MOUNTPOINT=/mnt/usb-scratch + USB_SCRATCH_MOUNTPOINT=/mnt/astraios + # Auto-select the removable 64GB USB partition on each worker. + USB_SCRATCH_AUTO_SELECT_REMOVABLE=true + USB_SCRATCH_AUTO_MIN_SIZE_GIB=50 + # One-time bootstrap for new sticks that ship exfat/fat32. + USB_SCRATCH_AUTO_FORMAT_REMOVABLE=true + USB_SCRATCH_AUTO_FORMAT_LABEL=astraios + # Keep /tmp in RAM to reduce SD-card writes. + USB_SCRATCH_ENFORCE_TMPFS_TMP=true USB_SCRATCH_REQUIRED_FREE_GIB=20 USB_SCRATCH_RECONCILE_INTERVAL_SEC=900 USB_SCRATCH_CUTOVER_JITTER_MAX_SEC=900 usb_scratch_inventory.tsv: | # node_name enabled match_kind match_value fstype # match_kind: uuid | label | device - # Prefer UUID entries for the first rollout. A shared label works too if every Pi USB stick is formatted consistently. + # Astraios policy: + # - use UUID entries per worker node (preferred) + # - avoid shared labels to prevent accidental wrong-device mounts + # - mountpoint is /mnt/astraios on every worker node # Example: # titan-04 true uuid 11111111-2222-3333-4444-555555555555 ext4 - # titan-05 true label atlas-scratch ext4 - # titan-06 true label atlas-scratch ext4 - # titan-07 true label atlas-scratch ext4 - # titan-08 true label atlas-scratch ext4 - # titan-09 true label atlas-scratch ext4 - # titan-10 true label atlas-scratch ext4 - # titan-11 true label atlas-scratch ext4 - # titan-12 true label atlas-scratch ext4 - # titan-13 true label atlas-scratch ext4 - # titan-14 true label atlas-scratch ext4 - # titan-15 true label atlas-scratch ext4 - # titan-17 true label atlas-scratch ext4 - # titan-18 true label atlas-scratch ext4 - # titan-19 true label atlas-scratch ext4 + # titan-05 true uuid ext4 + # titan-06 true uuid ext4 + # titan-07 true uuid ext4 + # titan-08 true uuid ext4 + # titan-09 true uuid ext4 + # titan-10 true uuid ext4 + # titan-11 true uuid ext4 + # titan-12 true uuid ext4 + # titan-13 true uuid ext4 + # titan-14 true uuid ext4 + # titan-15 true uuid ext4 + # titan-16 true uuid ext4 + # titan-17 true uuid ext4 + # titan-18 true uuid ext4 + # titan-19 true uuid ext4 diff --git a/services/maintenance/scripts/pi_usb_scratch.sh b/services/maintenance/scripts/pi_usb_scratch.sh index fad30339..5efff529 100755 --- a/services/maintenance/scripts/pi_usb_scratch.sh +++ b/services/maintenance/scripts/pi_usb_scratch.sh @@ -13,9 +13,14 @@ ONE_SHOT=${ONE_SHOT:-false} DEFAULT_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" USB_SCRATCH_DEFAULT_ENABLED=${USB_SCRATCH_DEFAULT_ENABLED:-true} -USB_SCRATCH_DEFAULT_LABEL=${USB_SCRATCH_DEFAULT_LABEL:-atlas-scratch} +USB_SCRATCH_DEFAULT_LABEL=${USB_SCRATCH_DEFAULT_LABEL:-} USB_SCRATCH_DEFAULT_FSTYPE=${USB_SCRATCH_DEFAULT_FSTYPE:-ext4} -USB_SCRATCH_MOUNTPOINT=${USB_SCRATCH_MOUNTPOINT:-/mnt/usb-scratch} +USB_SCRATCH_MOUNTPOINT=${USB_SCRATCH_MOUNTPOINT:-/mnt/astraios} +USB_SCRATCH_ENFORCE_TMPFS_TMP=${USB_SCRATCH_ENFORCE_TMPFS_TMP:-true} +USB_SCRATCH_AUTO_SELECT_REMOVABLE=${USB_SCRATCH_AUTO_SELECT_REMOVABLE:-true} +USB_SCRATCH_AUTO_MIN_SIZE_GIB=${USB_SCRATCH_AUTO_MIN_SIZE_GIB:-50} +USB_SCRATCH_AUTO_FORMAT_REMOVABLE=${USB_SCRATCH_AUTO_FORMAT_REMOVABLE:-true} +USB_SCRATCH_AUTO_FORMAT_LABEL=${USB_SCRATCH_AUTO_FORMAT_LABEL:-astraios} USB_SCRATCH_REQUIRED_FREE_GIB=${USB_SCRATCH_REQUIRED_FREE_GIB:-20} USB_SCRATCH_RECONCILE_INTERVAL_SEC=${USB_SCRATCH_RECONCILE_INTERVAL_SEC:-900} USB_SCRATCH_CUTOVER_JITTER_MAX_SEC=${USB_SCRATCH_CUTOVER_JITTER_MAX_SEC:-900} @@ -24,7 +29,9 @@ TARGET_PATHS=( "/var/log/pods" "/var/log/containers" "/var/lib/rancher/k3s/agent/containerd" + "/var/lib/rancher/k3s/agent/kubelet" "/var/lib/rancher/k3s/agent/images" + "/var/tmp" ) agent_stopped=0 @@ -44,6 +51,12 @@ annotate_node() { local timestamp timestamp="$(date -u +%FT%TZ)" kubectl annotate --overwrite node "${NODE_NAME}" \ + maintenance.bstein.dev/astraios-status="$(sanitize_annotation_value "${status}")" \ + maintenance.bstein.dev/astraios-detail="$(sanitize_annotation_value "${detail}")" \ + maintenance.bstein.dev/astraios-selector="$(sanitize_annotation_value "${selector}")" \ + maintenance.bstein.dev/astraios-mountpoint="$(sanitize_annotation_value "${USB_SCRATCH_MOUNTPOINT}")" \ + maintenance.bstein.dev/astraios-managed-paths="$(sanitize_annotation_value "${TARGET_PATHS[*]}")" \ + maintenance.bstein.dev/astraios-last-apply="${timestamp}" \ maintenance.bstein.dev/usb-scratch-status="$(sanitize_annotation_value "${status}")" \ maintenance.bstein.dev/usb-scratch-detail="$(sanitize_annotation_value "${detail}")" \ maintenance.bstein.dev/usb-scratch-selector="$(sanitize_annotation_value "${selector}")" \ @@ -82,9 +95,14 @@ load_config() { fi USB_SCRATCH_DEFAULT_ENABLED=${USB_SCRATCH_DEFAULT_ENABLED:-true} - USB_SCRATCH_DEFAULT_LABEL=${USB_SCRATCH_DEFAULT_LABEL:-atlas-scratch} + USB_SCRATCH_DEFAULT_LABEL=${USB_SCRATCH_DEFAULT_LABEL:-} USB_SCRATCH_DEFAULT_FSTYPE=${USB_SCRATCH_DEFAULT_FSTYPE:-ext4} - USB_SCRATCH_MOUNTPOINT=${USB_SCRATCH_MOUNTPOINT:-/mnt/usb-scratch} + USB_SCRATCH_MOUNTPOINT=${USB_SCRATCH_MOUNTPOINT:-/mnt/astraios} + USB_SCRATCH_ENFORCE_TMPFS_TMP=${USB_SCRATCH_ENFORCE_TMPFS_TMP:-true} + USB_SCRATCH_AUTO_SELECT_REMOVABLE=${USB_SCRATCH_AUTO_SELECT_REMOVABLE:-true} + USB_SCRATCH_AUTO_MIN_SIZE_GIB=${USB_SCRATCH_AUTO_MIN_SIZE_GIB:-50} + USB_SCRATCH_AUTO_FORMAT_REMOVABLE=${USB_SCRATCH_AUTO_FORMAT_REMOVABLE:-true} + USB_SCRATCH_AUTO_FORMAT_LABEL=${USB_SCRATCH_AUTO_FORMAT_LABEL:-astraios} USB_SCRATCH_REQUIRED_FREE_GIB=${USB_SCRATCH_REQUIRED_FREE_GIB:-20} USB_SCRATCH_RECONCILE_INTERVAL_SEC=${USB_SCRATCH_RECONCILE_INTERVAL_SEC:-900} USB_SCRATCH_CUTOVER_JITTER_MAX_SEC=${USB_SCRATCH_CUTOVER_JITTER_MAX_SEC:-900} @@ -117,15 +135,22 @@ ensure_fstab_block() { tmp_candidate="${STATE_DIR}/fstab.candidate" strip_managed_block "${FSTAB_PATH}" > "${tmp_base}" + if [ "${USB_SCRATCH_ENFORCE_TMPFS_TMP}" = "true" ]; then + awk '$1 ~ /^#/ || $2 != "/tmp" { print }' "${tmp_base}" > "${tmp_base}.tmpfs" + mv "${tmp_base}.tmpfs" "${tmp_base}" + fi cp "${tmp_base}" "${tmp_candidate}" { printf '%s\n' "${MANAGED_BEGIN}" - printf '%s %s %s defaults,nofail,noatime,lazytime,commit=60,x-systemd.device-timeout=15s,x-systemd.mount-timeout=30s 0 2\n' \ + printf '%s %s %s defaults,noatime,lazytime,commit=60,x-systemd.device-timeout=15s,x-systemd.mount-timeout=30s 0 2\n' \ "${selector}" "${USB_SCRATCH_MOUNTPOINT}" "${fstype}" + if [ "${USB_SCRATCH_ENFORCE_TMPFS_TMP}" = "true" ]; then + printf '%s\n' 'tmpfs /tmp tmpfs defaults,nosuid,nodev,mode=1777 0 0' + fi for target in "${TARGET_PATHS[@]}"; do bind_source="${USB_SCRATCH_MOUNTPOINT}${target}" - printf '%s %s none bind,nofail,x-systemd.requires-mounts-for=%s 0 0\n' \ + printf '%s %s none bind,x-systemd.requires-mounts-for=%s 0 0\n' \ "${bind_source}" "${target}" "${USB_SCRATCH_MOUNTPOINT}" done printf '%s\n' "${MANAGED_END}" @@ -140,11 +165,84 @@ ensure_fstab_block() { return 1 } +ensure_k3s_agent_guard() { + local dropin_dir dropin_file guard_dir guard_file target verify_cmd requires_mounts + local tmp_dropin + + dropin_dir="${HOST_ROOT}/etc/systemd/system/k3s-agent.service.d" + dropin_file="${dropin_dir}/20-astraios-guard.conf" + guard_dir="${HOST_ROOT}/usr/local/lib/maintenance" + guard_file="${guard_dir}/verify_astraios_mounts.sh" + tmp_dropin="${STATE_DIR}/k3s-agent-astraios-dropin.conf" + + mkdir -p "${dropin_dir}" "${guard_dir}" "${STATE_DIR}" + + cat > "${guard_file}" <> "${guard_file}" </dev/null || true) +if [[ "\${src}" != '${USB_SCRATCH_MOUNTPOINT}${target}' ]]; then + echo "astraios guard: ${target} is not bound to ${USB_SCRATCH_MOUNTPOINT}${target}" >&2 + exit 1 +fi +EOF + done + chmod 0755 "${guard_file}" + + requires_mounts="${USB_SCRATCH_MOUNTPOINT}" + for target in "${TARGET_PATHS[@]}"; do + requires_mounts="${requires_mounts} ${target}" + done + verify_cmd="${guard_file#${HOST_ROOT}}" + + cat > "${tmp_dropin}" </dev/null || true")" + [ "${fstype}" = "tmpfs" ] +} + +ensure_tmp_tmpfs_live() { + host_sh "mkdir -p /tmp; chmod 1777 /tmp; fstype=\$(findmnt -T /tmp -n -o FSTYPE 2>/dev/null || true); if [ \"\${fstype}\" != \"tmpfs\" ]; then mount /tmp 2>/dev/null || mount -t tmpfs -o defaults,nosuid,nodev,mode=1777 tmpfs /tmp; fi" +} + find_existing_mount_source() { local target="$1" host_sh "findmnt -T '${target}' -n -o SOURCE 2>/dev/null || true" } +auto_discover_removable_partition() { + local min_bytes + min_bytes=$(( USB_SCRATCH_AUTO_MIN_SIZE_GIB * 1024 * 1024 * 1024 )) + host_sh "lsblk -brnpo NAME,TYPE,SIZE,RM | awk '\$2==\"part\" && \$4==\"1\" && \$3>=${min_bytes} {print \$1; exit}'" +} + +format_device_ext4() { + local device="$1" + local label="$2" + host_sh "mountpoint=\$(findmnt -S '${device}' -n -o TARGET 2>/dev/null || true); if [ -n \"\${mountpoint}\" ]; then umount \"\${mountpoint}\"; fi; wipefs -a '${device}'; mkfs.ext4 -F -L '${label}' '${device}'" +} + resolve_selector() { local inventory_line enabled kind value fstype actual_device actual_fstype actual_uuid actual_label selector expected_fstype inventory_line="$(lookup_inventory)" @@ -158,6 +256,9 @@ resolve_selector() { elif [ -n "${USB_SCRATCH_DEFAULT_LABEL}" ]; then kind="label" value="${USB_SCRATCH_DEFAULT_LABEL}" + elif [ "${USB_SCRATCH_AUTO_SELECT_REMOVABLE}" = "true" ]; then + kind="auto" + value="removable-${USB_SCRATCH_AUTO_MIN_SIZE_GIB}Gi-plus" fi if [ "${enabled}" != "true" ]; then @@ -191,6 +292,10 @@ resolve_selector() { selector="${value}" actual_device="$(host_sh "if [ -b '${value}' ]; then printf '%s' '${value}'; fi")" ;; + auto) + actual_device="$(auto_discover_removable_partition)" + selector="${actual_device}" + ;; *) SELECTOR_KIND="invalid" SELECTOR_VALUE="${value}" @@ -204,13 +309,26 @@ resolve_selector() { actual_fstype="" actual_uuid="" actual_label="" + if [ "${kind}" = "auto" ] && [ -z "${actual_device}" ]; then + SELECTOR_KIND="missing" + SELECTOR_VALUE="${value}" + SELECTOR_SPEC="" + DEVICE_PATH="" + DEVICE_FSTYPE="${fstype}" + SELECTOR_MATCH_KIND="${kind}" + return 0 + fi if [ -n "${actual_device}" ]; then actual_fstype="$(host_sh "blkid -o value -s TYPE '${actual_device}' 2>/dev/null || true")" actual_uuid="$(host_sh "blkid -o value -s UUID '${actual_device}' 2>/dev/null || true")" actual_label="$(host_sh "blkid -o value -s LABEL '${actual_device}' 2>/dev/null || true")" fi + if [ "${kind}" = "auto" ] && [ -n "${actual_uuid}" ]; then + selector="UUID=${actual_uuid}" + fi expected_fstype="${fstype:-${USB_SCRATCH_DEFAULT_FSTYPE}}" + SELECTOR_MATCH_KIND="${kind}" if [ -n "${actual_fstype}" ] && [ -n "${expected_fstype}" ] && [ "${actual_fstype}" != "${expected_fstype}" ]; then SELECTOR_KIND="fs-mismatch" SELECTOR_VALUE="${selector}" @@ -298,7 +416,7 @@ perform_cutover() { sleep "${jitter}" fi - log "stopping k3s-agent for USB scratch cutover" + log "stopping k3s-agent for Astraios cutover" host_sh "systemctl stop k3s-agent" agent_stopped=1 @@ -311,13 +429,13 @@ perform_cutover() { fi done - log "starting k3s-agent after USB scratch cutover" + log "starting k3s-agent after Astraios cutover" host_sh "systemctl start k3s-agent" agent_stopped=0 } reconcile_once() { - local fstab_changed=false free_gib selector_detail + local fstab_changed=false guard_changed=false free_gib selector_detail tmp_detail load_config resolve_selector @@ -326,7 +444,7 @@ reconcile_once() { case "${SELECTOR_KIND}" in disabled) annotate_node "disabled" "inventory-disabled" "none" - log "inventory disables USB scratch on ${NODE_NAME}" + log "inventory disables Astraios on ${NODE_NAME}" return 0 ;; missing) @@ -340,9 +458,17 @@ reconcile_once() { return 0 ;; fs-mismatch) - annotate_node "error" "filesystem-mismatch" "${SELECTOR_SPEC}" - log "filesystem mismatch on ${DEVICE_PATH}: expected ${USB_SCRATCH_DEFAULT_FSTYPE}, got ${DEVICE_FSTYPE}" - return 0 + if [ "${USB_SCRATCH_AUTO_FORMAT_REMOVABLE}" = "true" ] && [ "${SELECTOR_MATCH_KIND:-}" = "auto" ] && [ -n "${DEVICE_PATH}" ]; then + log "formatting auto-discovered device ${DEVICE_PATH} as ext4 label=${USB_SCRATCH_AUTO_FORMAT_LABEL}" + if format_device_ext4 "${DEVICE_PATH}" "${USB_SCRATCH_AUTO_FORMAT_LABEL}"; then + resolve_selector + fi + fi + if [ "${SELECTOR_KIND}" = "fs-mismatch" ]; then + annotate_node "error" "filesystem-mismatch" "${SELECTOR_SPEC}" + log "filesystem mismatch on ${DEVICE_PATH}: expected ${USB_SCRATCH_DEFAULT_FSTYPE}, got ${DEVICE_FSTYPE}" + return 0 + fi ;; esac @@ -355,8 +481,8 @@ reconcile_once() { if [ -z "${DEVICE_PATH}" ]; then annotate_node "pending" "device-not-found" "${selector_detail}" - log "scratch device not present yet for selector ${selector_detail}" - return 0 + log "Astraios device not present yet for selector ${selector_detail}" + return 0 fi if ! ensure_usb_mount_live; then @@ -372,10 +498,15 @@ reconcile_once() { if [ "${free_gib}" -lt "${USB_SCRATCH_REQUIRED_FREE_GIB}" ]; then annotate_node "error" "insufficient-free-space-${free_gib}Gi" "${selector_detail}" - log "usb scratch free space ${free_gib}Gi below required ${USB_SCRATCH_REQUIRED_FREE_GIB}Gi" + log "Astraios free space ${free_gib}Gi below required ${USB_SCRATCH_REQUIRED_FREE_GIB}Gi" return 0 fi + if ensure_k3s_agent_guard; then + guard_changed=true + host_sh "systemctl daemon-reload || true" + fi + if host_sh "systemctl list-unit-files | grep -q '^k3s-agent.service'"; then perform_cutover else @@ -389,11 +520,26 @@ reconcile_once() { return 0 fi - if [ "${fstab_changed}" = true ]; then - log "usb scratch fstab refreshed for ${NODE_NAME}" + tmp_detail="tmpfs-ok" + if [ "${USB_SCRATCH_ENFORCE_TMPFS_TMP}" = "true" ]; then + if ! ensure_tmp_tmpfs_live || ! tmp_is_tmpfs; then + annotate_node "error" "tmpfs-tmp-enforce-failed" "${selector_detail}" + log "failed to enforce /tmp tmpfs on ${NODE_NAME}" + return 0 + fi + elif ! tmp_is_tmpfs; then + tmp_detail="tmp-not-tmpfs" + log "warning: /tmp is not tmpfs on ${NODE_NAME}; SD wear reduction is lower than expected" fi - annotate_node "ready" "scratch-online-${free_gib}Gi" "${selector_detail}" - log "usb scratch ready on ${NODE_NAME} via ${selector_detail} mounted at ${USB_SCRATCH_MOUNTPOINT}" + + if [ "${fstab_changed}" = true ]; then + log "Astraios fstab refreshed for ${NODE_NAME}" + fi + if [ "${guard_changed}" = true ]; then + log "k3s-agent Astraios guard refreshed for ${NODE_NAME}" + fi + annotate_node "ready" "astraios-online-${free_gib}Gi-${tmp_detail}" "${selector_detail}" + log "Astraios ready on ${NODE_NAME} via ${selector_detail} mounted at ${USB_SCRATCH_MOUNTPOINT}" } main() { diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json index ea595792..e7d08229 100644 --- a/services/monitoring/dashboards/atlas-nodes.json +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -584,6 +584,44 @@ } }, "timeFrom": "30d" + }, + { + "id": 9, + "type": "timeseries", + "title": "Astraios Usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 44 + }, + "targets": [ + { + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "30d" } ], "time": { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 4ca00147..df56673b 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1957,7 +1957,7 @@ { "id": 47, "type": "bargauge", - "title": "Platform Suite Pass Rate (24h)", + "title": "PVC Backup Health / Age", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1970,31 +1970,35 @@ }, "targets": [ { - "expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))", + "expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))", "refId": "A", - "legendFormat": "{{suite}}", + "legendFormat": "{{namespace}}/{{pvc}}", "instant": true } ], "fieldConfig": { "defaults": { - "unit": "percent", + "unit": "h", "min": 0, - "max": 100, + "max": null, "thresholds": { "mode": "absolute", "steps": [ { - "color": "red", + "color": "green", "value": null }, { "color": "yellow", - "value": 80 + "value": 6 }, { - "color": "green", - "value": 95 + "color": "orange", + "value": 12 + }, + { + "color": "red", + "value": 24 } ] } @@ -2025,12 +2029,12 @@ ], "links": [ { - "title": "Open atlas-jobs dashboard", - "url": "/d/atlas-jobs", + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", "targetBlank": true } ], - "description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture." + "description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published." }, { "id": 30, @@ -3172,7 +3176,7 @@ { "id": 22, "type": "bargauge", - "title": "Nodes Closest to Full Root Disks", + "title": "Nodes Closest to Full Astraios Disks", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -3185,7 +3189,7 @@ }, "targets": [ { - "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", + "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", "refId": "A", "legendFormat": "{{node}}" } diff --git a/services/monitoring/grafana-dashboard-nodes.yaml b/services/monitoring/grafana-dashboard-nodes.yaml index 98123b96..0131c74e 100644 --- a/services/monitoring/grafana-dashboard-nodes.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -593,6 +593,44 @@ data: } }, "timeFrom": "30d" + }, + { + "id": 9, + "type": "timeseries", + "title": "Astraios Usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 44 + }, + "targets": [ + { + "expr": "avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "30d" } ], "time": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index d7eef6bc..01a77fcd 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1966,7 +1966,7 @@ data: { "id": 47, "type": "bargauge", - "title": "Platform Suite Pass Rate (24h)", + "title": "PVC Backup Health / Age", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1979,31 +1979,35 @@ data: }, "targets": [ { - "expr": "sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\",status=~\"ok|passed|success\"}[24h]))) / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))), 1)) and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total{suite=~\"ariadne|metis|ananke|atlasbot|lesavka|pegasus|soteria|titan-iac|bstein-home|arcanagon|data-prepper\"}[24h]))) > 0))", + "expr": "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours))", "refId": "A", - "legendFormat": "{{suite}}", + "legendFormat": "{{namespace}}/{{pvc}}", "instant": true } ], "fieldConfig": { "defaults": { - "unit": "percent", + "unit": "h", "min": 0, - "max": 100, + "max": null, "thresholds": { "mode": "absolute", "steps": [ { - "color": "red", + "color": "green", "value": null }, { "color": "yellow", - "value": 80 + "value": 6 }, { - "color": "green", - "value": 95 + "color": "orange", + "value": 12 + }, + { + "color": "red", + "value": 24 } ] } @@ -2034,12 +2038,12 @@ data: ], "links": [ { - "title": "Open atlas-jobs dashboard", - "url": "/d/atlas-jobs", + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", "targetBlank": true } ], - "description": "24-hour per-suite pass-rate snapshot. This complements the 7-day trend by showing each suite's current quality posture." + "description": "Oldest backup age in hours by PVC. This panel is reserved for the upcoming PVC backup health feed and will show no data until those metrics are published." }, { "id": 30, @@ -3181,7 +3185,7 @@ data: { "id": 22, "type": "bargauge", - "title": "Nodes Closest to Full Root Disks", + "title": "Nodes Closest to Full Astraios Disks", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -3194,7 +3198,7 @@ data: }, "targets": [ { - "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", + "expr": "sort_desc(topk(12, avg by (node) ((avg by (instance) ((1 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astraios\",fstype!~\"tmpfs|overlay\"})) * 100)) * on(instance) group_left(node) label_replace(node_uname_info{nodename!=\"\"}, \"node\", \"$1\", \"nodename\", \"(.*)\"))))", "refId": "A", "legendFormat": "{{node}}" }