9 changed files with 63 additions and 125 deletions
--- a/scripts/dashboards_render_atlas.py
+++ b/scripts/dashboards_render_atlas.py
@ -423,17 +423,16 @@ ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
    "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
 )
 ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
-TEST_REPO_SELECTOR = 'repo=~"ariadne|metis"'
+ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
-TEST_CI_COVERAGE = f'ariadne_ci_coverage_percent{{{TEST_REPO_SELECTOR}}}'
+ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
-TEST_CI_TESTS = f'ariadne_ci_tests_total{{{TEST_REPO_SELECTOR}}}'
+ARIADNE_TEST_SUCCESS_RATE = (
 TEST_SUCCESS_RATE = (
    "100 * "
-    f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result="passed"}}[30d])) '
+    'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) '
    "/ clamp_min("
-    f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"passed|failed|error"}}[30d])), 1)'
+    'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)'
 )
-TEST_FAILURES_24H = (
+ARIADNE_TEST_FAILURES_24H = (
-    f'sum by (result) (max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"failed|error"}}[24h]))'
+    'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
 )
 POSTGRES_CONN_USED = (
    'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
@ -1298,8 +1297,8 @@ def build_overview():
    panels.append(
        timeseries_panel(
            42,
-            "Ariadne + Metis Test Success Rate",
+            "Ariadne Test Success Rate",
-            TEST_SUCCESS_RATE,
+            ARIADNE_TEST_SUCCESS_RATE,
            {"h": 6, "w": 6, "x": 12, "y": 14},
            unit="percent",
            max_value=100,
@ -1310,8 +1309,8 @@ def build_overview():
    panels.append(
        bargauge_panel(
            43,
-            "Ariadne + Metis Tests with Failures (24h)",
+            "Tests with Failures (24h)",
-            TEST_FAILURES_24H,
+            ARIADNE_TEST_FAILURES_24H,
            {"h": 6, "w": 6, "x": 18, "y": 14},
            unit="none",
            instant=True,
@ -2657,8 +2656,8 @@ def build_jobs_dashboard():
    panels.append(
        stat_panel(
            17,
-            "Ariadne + Metis CI Coverage (%)",
+            "Ariadne CI Coverage (%)",
-            TEST_CI_COVERAGE,
+            ARIADNE_CI_COVERAGE,
            {"h": 6, "w": 4, "x": 8, "y": 11},
            unit="percent",
            decimals=1,
@ -2669,8 +2668,8 @@ def build_jobs_dashboard():
    panels.append(
        table_panel(
            18,
-            "Ariadne + Metis CI Tests (latest)",
+            "Ariadne CI Tests (latest)",
-            TEST_CI_TESTS,
+            ARIADNE_CI_TESTS,
            {"h": 6, "w": 12, "x": 12, "y": 11},
            unit="none",
            transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
--- a/services/maintenance/node-image-sweeper-daemonset.yaml
+++ b/services/maintenance/node-image-sweeper-daemonset.yaml
@ -10,8 +10,6 @@ spec:
      app: node-image-sweeper
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 100%
  template:
    metadata:
      labels:
@ -31,21 +29,6 @@ spec:
        - name: node-image-sweeper
          image: python:3.12.9-alpine3.20
          command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
          env:
            - name: SWEEP_INTERVAL_SEC
              value: "21600"
            - name: HIGH_USAGE_PERCENT
              value: "70"
            - name: EMERGENCY_USAGE_PERCENT
              value: "80"
            - name: BASE_THRESHOLD_DAYS
              value: "14"
            - name: HIGH_USAGE_THRESHOLD_DAYS
              value: "3"
            - name: LOG_RETENTION_DAYS
              value: "7"
            - name: JOURNAL_MAX_SIZE
              value: "200M"
          securityContext:
            privileged: true
            runAsUser: 0
--- a/services/maintenance/scripts/node_image_sweeper.sh
+++ b/services/maintenance/scripts/node_image_sweeper.sh
@ -2,39 +2,26 @@
 set -eu
 ONE_SHOT=${ONE_SHOT:-false}
-SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
+THRESHOLD_DAYS=14
 BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
 HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
 HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
 EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
 LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
 JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
 SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
-sweep_once() {
+usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
-  usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
+if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then
-  threshold_days="${BASE_THRESHOLD_DAYS}"
+  THRESHOLD_DAYS=3
-  if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
+fi
    threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
  fi
-  cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY'
+cutoff=$(python3 - <<'PY'
-import os
+import time, os
-import time
+print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400)
 days = int(os.environ.get("THRESHOLD_DAYS", "14"))
 print(int(time.time()) - days * 86400)
 PY
 )
-  RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
+RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
-  IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
+IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
-  prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
+SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
-import json
+
-import os
+prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
-import sys
+import json, os, sys, time
 import time
 try:
    data = json.load(sys.stdin)
@ -87,33 +74,19 @@ for p in prune:
 PY
 )
-  if [ -n "${prune_list}" ]; then
+if [ -n "${prune_list}" ]; then
-    printf "%s" "${prune_list}" | while read -r image_id; do
+  printf "%s" "${prune_list}" | while read -r image_id; do
-      if [ -n "${image_id}" ]; then
+    if [ -n "${image_id}" ]; then
-        chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
+      chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
-      fi
+    fi
-    done
+  done
-  fi
+fi
-  find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
+find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
-  find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
+find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
  if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
    # Emergency pass for rootfs pressure on SD-backed nodes.
    chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
    find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
    find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
    chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
  fi
 }
 sweep_once
 if [ "${ONE_SHOT}" = "true" ]; then
  exit 0
 fi
-while true; do
+sleep infinity
  sleep "${SWEEP_INTERVAL_SEC}"
  sweep_once
 done
--- a/services/monitoring/dashboards/atlas-jobs.json
+++ b/services/monitoring/dashboards/atlas-jobs.json
@ -1125,7 +1125,7 @@
    {
      "id": 17,
      "type": "stat",
-      "title": "Ariadne + Metis CI Coverage (%)",
+      "title": "Ariadne CI Coverage (%)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -1138,7 +1138,7 @@
      },
      "targets": [
        {
-          "expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
+          "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
          "refId": "A",
          "legendFormat": "{{branch}}",
          "instant": true
@ -1188,7 +1188,7 @@
    {
      "id": 18,
      "type": "table",
-      "title": "Ariadne + Metis CI Tests (latest)",
+      "title": "Ariadne CI Tests (latest)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -1201,7 +1201,7 @@
      },
      "targets": [
        {
-          "expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
+          "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
          "refId": "A",
          "instant": true
        }
--- a/services/monitoring/dashboards/atlas-overview.json
+++ b/services/monitoring/dashboards/atlas-overview.json
@ -1677,7 +1677,7 @@
    {
      "id": 42,
      "type": "timeseries",
-      "title": "Ariadne + Metis Test Success Rate",
+      "title": "Ariadne Test Success Rate",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -1690,7 +1690,7 @@
      },
      "targets": [
        {
-          "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
+          "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
          "refId": "A"
        }
      ],
@ -1714,7 +1714,7 @@
    {
      "id": 43,
      "type": "bargauge",
-      "title": "Ariadne + Metis Tests with Failures (24h)",
+      "title": "Tests with Failures (24h)",
      "datasource": {
        "type": "prometheus",
        "uid": "atlas-vm"
@ -1727,7 +1727,7 @@
      },
      "targets": [
        {
-          "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
+          "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
          "refId": "A",
          "legendFormat": "{{result}}",
          "instant": true
--- a/services/monitoring/grafana-alerting-config.yaml
+++ b/services/monitoring/grafana-alerting-config.yaml
@ -22,24 +22,7 @@ data:
      - orgId: 1
        receiver: email-admins
        group_by:
          - grafana_folder
          - alertname
        group_wait: 1m
        group_interval: 30m
        repeat_interval: 12h
        routes:
          - receiver: email-admins
            object_matchers:
              - [severity, "=", "critical"]
            group_wait: 30s
            group_interval: 5m
            repeat_interval: 2h
          - receiver: email-admins
            object_matchers:
              - [severity, "=", "warning"]
            group_wait: 5m
            group_interval: 2h
            repeat_interval: 24h
  rules.yaml: |
    apiVersion: 1
    groups:
@ -49,7 +32,7 @@ data:
        interval: 1m
        rules:
          - uid: disk-pressure-root
-            title: "Node rootfs high (>85%)"
+            title: "Node rootfs high (>80%)"
            condition: C
            for: "10m"
            data:
@ -83,7 +66,7 @@ data:
                  type: threshold
                  conditions:
                    - evaluator:
-                        params: [85]
+                        params: [80]
                        type: gt
                      operator:
                        type: and
@ -93,7 +76,7 @@ data:
            noDataState: NoData
            execErrState: Error
            annotations:
-              summary: "{{ $labels.node }} rootfs >85% for 10m"
+              summary: "{{ $labels.node }} rootfs >80% for 10m"
            labels:
              severity: warning
          - uid: disk-growth-1h
@ -518,7 +501,7 @@ data:
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
-                  expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
+                  expr: postmark_outbound_bounce_rate{window="1d"}
                  legendFormat: bounce 1d
                  datasource:
                    type: prometheus
@ -547,7 +530,7 @@ data:
                      reducer:
                        type: last
                      type: query
-            noDataState: OK
+            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Postmark 1d bounce rate >5%"
@ -566,7 +549,7 @@ data:
                model:
                  intervalMs: 60000
                  maxDataPoints: 43200
-                  expr: max(postmark_api_up) or on() vector(0)
+                  expr: min_over_time(max by (instance) (postmark_api_up)[5m])
                  legendFormat: api up
                  datasource:
                    type: prometheus
@ -595,7 +578,7 @@ data:
                      reducer:
                        type: last
                      type: query
-            noDataState: OK
+            noDataState: NoData
            execErrState: Error
            annotations:
              summary: "Postmark exporter reports API down"
--- a/services/monitoring/grafana-dashboard-jobs.yaml
+++ b/services/monitoring/grafana-dashboard-jobs.yaml
@ -1134,7 +1134,7 @@ data:
        {
          "id": 17,
          "type": "stat",
-          "title": "Ariadne + Metis CI Coverage (%)",
+          "title": "Ariadne CI Coverage (%)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -1147,7 +1147,7 @@ data:
          },
          "targets": [
            {
-              "expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
+              "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
              "refId": "A",
              "legendFormat": "{{branch}}",
              "instant": true
@ -1197,7 +1197,7 @@ data:
        {
          "id": 18,
          "type": "table",
-          "title": "Ariadne + Metis CI Tests (latest)",
+          "title": "Ariadne CI Tests (latest)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -1210,7 +1210,7 @@ data:
          },
          "targets": [
            {
-              "expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
+              "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
              "refId": "A",
              "instant": true
            }
--- a/services/monitoring/grafana-dashboard-overview.yaml
+++ b/services/monitoring/grafana-dashboard-overview.yaml
@ -1686,7 +1686,7 @@ data:
        {
          "id": 42,
          "type": "timeseries",
-          "title": "Ariadne + Metis Test Success Rate",
+          "title": "Ariadne Test Success Rate",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -1699,7 +1699,7 @@ data:
          },
          "targets": [
            {
-              "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
+              "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
              "refId": "A"
            }
          ],
@ -1723,7 +1723,7 @@ data:
        {
          "id": 43,
          "type": "bargauge",
-          "title": "Ariadne + Metis Tests with Failures (24h)",
+          "title": "Tests with Failures (24h)",
          "datasource": {
            "type": "prometheus",
            "uid": "atlas-vm"
@ -1736,7 +1736,7 @@ data:
          },
          "targets": [
            {
-              "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
+              "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
              "refId": "A",
              "legendFormat": "{{result}}",
              "instant": true
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@ -286,7 +286,7 @@ spec:
    podAnnotations:
      vault.hashicorp.com/agent-inject: "true"
      vault.hashicorp.com/role: "monitoring"
-      monitoring.bstein.dev/restart-rev: "6"
+      monitoring.bstein.dev/restart-rev: "4"
      vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
      vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
        {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}