From c7edc81239839e00ff2ab463f0df5144b09a5915 Mon Sep 17 00:00:00 2001
From: jenkins
Date: Fri, 22 May 2026 17:10:01 -0300
Subject: [PATCH] maintenance: stabilize recovered worker nodes

---
Reviewer notes (free-form text after the "---" separator; not part of the
commit message and not part of any hunk):

Formatting: this copy of the patch had been collapsed onto a few long lines.
Line breaks and leading indentation inside the hunks below were reconstructed;
the hunk line counts match the "@@" headers and the diffstat, but the exact
indentation widths are a best guess -- TODO confirm against the target files
(or apply with whitespace-tolerant options).

infrastructure/core/node-prefer-noschedule-cronjob.yaml
  * schedule "*/20 * * * *" -> "* * * * *"; concurrencyPolicy Forbid ->
    Replace; backoffLimit 1 -> 0; restartPolicy OnFailure -> Never.
  * Adds k(), a wrapper that runs kubectl with --request-timeout=10s; all
    kubectl calls in the script now go through it.
  * Adds clear_worker NODE HARDWARE. When the node exists it sets the labels
    node-role.kubernetes.io/worker=true and hardware=HARDWARE, removes the
    atlas.bstein.dev/spillover label, removes the
    node.kubernetes.io/unschedulable:NoSchedule taint and uncordons the node;
    otherwise it prints "skipping missing node NODE".
    Called for titan-04/05/07/08/11 (rpi5), titan-12/14/18 (rpi4) and
    titan-22 (amd64).
  * titan-22 additionally gets atlas.bstein.dev/general-compute=last-resort.
  * The existing loop over titan-13/15/17/19 now also sets longhorn-host=true.
  * Every mutating call is followed by "|| true", so with bash -e a failing
    label/taint/uncordon no longer aborts the rest of the script.
  * NOTE(review): clear_worker uncordons the listed nodes on every run, so a
    manual cordon/drain of one of them would presumably be undone at the next
    one-minute tick -- confirm this is intended.
  * NOTE(review): the script issues several dozen sequential calls, each
    allowed up to 10 s; with a one-minute schedule and Replace, a slow run
    could presumably be replaced before it reaches the later nodes -- confirm.

services/maintenance/titan-22-link-keeper-daemonset.yaml
  * Removes the early "exit 0" taken when "Link detected: yes"; each pass now
    always runs "ip link set enp5s0 up", "--set-eee ... eee off" and the
    advertise command.
  * Replaces the old sequence (autoneg with mask 0x80000000002f, then forced
    2500, then forced 1000) with: "advertise 0x020 autoneg on", five link
    probes spaced by "sleep 2", and, if any probe did not see the link,
    "advertise 0x008 autoneg on". The script's own message calls these
    "1G-only" and "100M-only".
  * Adds "ethtool --show-eee" to the diagnostics; outer loop sleep 60 -> 30.
  * NOTE(review): because "advertise 0x020" is re-applied at the top of every
    pass, a link that fell back to 0x008 is switched back on the next pass --
    confirm that renegotiating on each cycle is acceptable.

 .../core/node-prefer-noschedule-cronjob.yaml  | 47 +++++++++++++++----
 .../titan-22-link-keeper-daemonset.yaml       | 36 +++++++-------
 2 files changed, 55 insertions(+), 28 deletions(-)

diff --git a/infrastructure/core/node-prefer-noschedule-cronjob.yaml b/infrastructure/core/node-prefer-noschedule-cronjob.yaml
index dd4257df..2cad93d5 100644
--- a/infrastructure/core/node-prefer-noschedule-cronjob.yaml
+++ b/infrastructure/core/node-prefer-noschedule-cronjob.yaml
@@ -5,17 +5,17 @@ metadata:
   name: node-prefer-noschedule
   namespace: kube-system
 spec:
-  schedule: "*/20 * * * *"
-  concurrencyPolicy: Forbid
+  schedule: "* * * * *"
+  concurrencyPolicy: Replace
   successfulJobsHistoryLimit: 1
   failedJobsHistoryLimit: 3
   jobTemplate:
     spec:
-      backoffLimit: 1
+      backoffLimit: 0
       template:
         spec:
           serviceAccountName: node-prefer-noschedule
-          restartPolicy: OnFailure
+          restartPolicy: Never
           containers:
             - name: taint
               image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
@@ -24,11 +24,42 @@ spec:
                 - bash
                 - -ceu
                 - |
+                  k() {
+                    kubectl --request-timeout=10s "$@"
+                  }
+
+                  clear_worker() {
+                    local node="${1}"
+                    local hardware="${2}"
+                    if k get node "${node}" >/dev/null 2>&1; then
+                      k label node "${node}" node-role.kubernetes.io/worker=true "hardware=${hardware}" --overwrite=true || true
+                      k label node "${node}" atlas.bstein.dev/spillover- || true
+                      k taint node "${node}" node.kubernetes.io/unschedulable:NoSchedule- || true
+                      k uncordon "${node}" || true
+                    else
+                      echo "skipping missing node ${node}"
+                    fi
+                  }
+
+                  clear_worker titan-04 rpi5
+                  clear_worker titan-05 rpi5
+                  clear_worker titan-07 rpi5
+                  clear_worker titan-08 rpi5
+                  clear_worker titan-11 rpi5
+                  clear_worker titan-12 rpi4
+                  clear_worker titan-14 rpi4
+                  clear_worker titan-18 rpi4
+                  clear_worker titan-22 amd64
+
+                  if k get node titan-22 >/dev/null 2>&1; then
+                    k label node titan-22 atlas.bstein.dev/general-compute=last-resort --overwrite=true || true
+                  fi
+
                   for node in titan-13 titan-15 titan-17 titan-19; do
-                    if kubectl get node "${node}" >/dev/null 2>&1; then
-                      kubectl label node "${node}" atlas.bstein.dev/spillover=true --overwrite=true
-                      kubectl taint node "${node}" longhorn=true:PreferNoSchedule --overwrite=true
-                      kubectl taint node "${node}" atlas.bstein.dev/spillover=true:PreferNoSchedule --overwrite=true
+                    if k get node "${node}" >/dev/null 2>&1; then
+                      k label node "${node}" atlas.bstein.dev/spillover=true longhorn-host=true --overwrite=true || true
+                      k taint node "${node}" longhorn=true:PreferNoSchedule --overwrite=true || true
+                      k taint node "${node}" atlas.bstein.dev/spillover=true:PreferNoSchedule --overwrite=true || true
                     else
                       echo "skipping missing node ${node}"
                     fi
diff --git a/services/maintenance/titan-22-link-keeper-daemonset.yaml b/services/maintenance/titan-22-link-keeper-daemonset.yaml
index 89c4370f..a0b88c95 100644
--- a/services/maintenance/titan-22-link-keeper-daemonset.yaml
+++ b/services/maintenance/titan-22-link-keeper-daemonset.yaml
@@ -59,32 +59,28 @@ spec:
               while true; do
                 nsenter -t 1 -m -u -i -n -p -- /bin/sh -lc '
                   date -Is
-                  if ethtool enp5s0 | grep -q "Link detected: yes"; then
-                    ethtool enp5s0 | sed -n "1,25p" || true
-                    ip -br addr show enp5s0 || true
-                    exit 0
-                  fi
-
-                  echo "link down; attempting recovery"
                   ip link set enp5s0 up || true
                   ethtool --set-eee enp5s0 eee off || true
-                  ethtool -s enp5s0 advertise 0x80000000002f autoneg on || ethtool -s enp5s0 autoneg on || true
-                  sleep 5
+                  ethtool -s enp5s0 advertise 0x020 autoneg on || true
+                  sleep 3
 
-                  if ! ethtool enp5s0 | grep -q "Link detected: yes"; then
-                    echo "link still down after autoneg; trying forced 2.5G"
-                    ethtool -s enp5s0 speed 2500 duplex full autoneg off || true
-                    sleep 3
-                  fi
-
-                  if ! ethtool enp5s0 | grep -q "Link detected: yes"; then
-                    echo "link still down after 2.5G; trying forced 1G"
-                    ethtool -s enp5s0 speed 1000 duplex full autoneg off || true
-                    sleep 3
+                  stable=true
+                  for _ in 1 2 3 4 5; do
+                    if ! ethtool enp5s0 | grep -q "Link detected: yes"; then
+                      stable=false
+                    fi
+                    sleep 2
+                  done
+
+                  if [ "${stable}" != "true" ]; then
+                    echo "link not stable at 1G-only; falling back to 100M-only autoneg"
+                    ethtool -s enp5s0 advertise 0x008 autoneg on || true
+                    sleep 5
                   fi
 
+                  ethtool --show-eee enp5s0 || true
                   ethtool enp5s0 | sed -n "1,45p" || true
                   ip -br addr show enp5s0 || true
                 '
-                sleep 60
+                sleep 30
               done