maintenance: stabilize recovered worker nodes

This commit is contained in:
jenkins 2026-05-22 17:10:01 -03:00
parent 46c3e97688
commit c7edc81239
2 changed files with 55 additions and 28 deletions

View File

@ -5,17 +5,17 @@ metadata:
name: node-prefer-noschedule name: node-prefer-noschedule
namespace: kube-system namespace: kube-system
spec: spec:
schedule: "*/20 * * * *" schedule: "* * * * *"
concurrencyPolicy: Forbid concurrencyPolicy: Replace
successfulJobsHistoryLimit: 1 successfulJobsHistoryLimit: 1
failedJobsHistoryLimit: 3 failedJobsHistoryLimit: 3
jobTemplate: jobTemplate:
spec: spec:
backoffLimit: 1 backoffLimit: 0
template: template:
spec: spec:
serviceAccountName: node-prefer-noschedule serviceAccountName: node-prefer-noschedule
restartPolicy: OnFailure restartPolicy: Never
containers: containers:
- name: taint - name: taint
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
@ -24,11 +24,42 @@ spec:
- bash - bash
- -ceu - -ceu
- | - |
# Thin kubectl wrapper: forwards every argument unchanged and adds a
# 10-second --request-timeout so a single API call cannot hang indefinitely.
# Returns kubectl's own exit status.
k() {
  local -a timeout_flag=(--request-timeout=10s)
  kubectl "${timeout_flag[@]}" "$@"
}
# Mark a node as a schedulable worker again.
# Arguments:
#   $1 - node name
#   $2 - value written to the node's "hardware" label
# If the node cannot be fetched, a "skipping" message is printed and nothing
# else is done. Otherwise the node is labelled as a worker, its spillover
# label and unschedulable:NoSchedule taint are removed, and it is uncordoned.
# Every mutating step is best-effort: a failure is ignored (|| true) so that
# the remaining steps still run.
clear_worker() {
  local target="${1}"
  local hw_class="${2}"

  if ! k get node "${target}" >/dev/null 2>&1; then
    echo "skipping missing node ${target}"
    return
  fi

  k label node "${target}" node-role.kubernetes.io/worker=true "hardware=${hw_class}" --overwrite=true || true
  k label node "${target}" atlas.bstein.dev/spillover- || true
  k taint node "${target}" node.kubernetes.io/unschedulable:NoSchedule- || true
  k uncordon "${target}" || true
}
# Reset each listed worker in turn. Entries are "node:hardware" pairs, split
# with parameter expansion into the two clear_worker arguments.
for worker_spec in \
  titan-04:rpi5 titan-05:rpi5 titan-07:rpi5 titan-08:rpi5 titan-11:rpi5 \
  titan-12:rpi4 titan-14:rpi4 titan-18:rpi4 \
  titan-22:amd64; do
  clear_worker "${worker_spec%%:*}" "${worker_spec#*:}"
done

# titan-22 additionally gets the general-compute=last-resort label when the
# node exists; the label call is best-effort.
last_resort_node=titan-22
if k get node "${last_resort_node}" >/dev/null 2>&1; then
  k label node "${last_resort_node}" atlas.bstein.dev/general-compute=last-resort --overwrite=true || true
fi
for node in titan-13 titan-15 titan-17 titan-19; do for node in titan-13 titan-15 titan-17 titan-19; do
if kubectl get node "${node}" >/dev/null 2>&1; then if k get node "${node}" >/dev/null 2>&1; then
kubectl label node "${node}" atlas.bstein.dev/spillover=true --overwrite=true k label node "${node}" atlas.bstein.dev/spillover=true longhorn-host=true --overwrite=true || true
kubectl taint node "${node}" longhorn=true:PreferNoSchedule --overwrite=true k taint node "${node}" longhorn=true:PreferNoSchedule --overwrite=true || true
kubectl taint node "${node}" atlas.bstein.dev/spillover=true:PreferNoSchedule --overwrite=true k taint node "${node}" atlas.bstein.dev/spillover=true:PreferNoSchedule --overwrite=true || true
else else
echo "skipping missing node ${node}" echo "skipping missing node ${node}"
fi fi

View File

@ -59,32 +59,28 @@ spec:
while true; do while true; do
nsenter -t 1 -m -u -i -n -p -- /bin/sh -lc ' nsenter -t 1 -m -u -i -n -p -- /bin/sh -lc '
date -Is date -Is
if ethtool enp5s0 | grep -q "Link detected: yes"; then
ethtool enp5s0 | sed -n "1,25p" || true
ip -br addr show enp5s0 || true
exit 0
fi
echo "link down; attempting recovery"
ip link set enp5s0 up || true ip link set enp5s0 up || true
ethtool --set-eee enp5s0 eee off || true ethtool --set-eee enp5s0 eee off || true
ethtool -s enp5s0 advertise 0x80000000002f autoneg on || ethtool -s enp5s0 autoneg on || true ethtool -s enp5s0 advertise 0x020 autoneg on || true
sleep 5 sleep 3
if ! ethtool enp5s0 | grep -q "Link detected: yes"; then stable=true
echo "link still down after autoneg; trying forced 2.5G" for _ in 1 2 3 4 5; do
ethtool -s enp5s0 speed 2500 duplex full autoneg off || true if ! ethtool enp5s0 | grep -q "Link detected: yes"; then
sleep 3 stable=false
fi fi
sleep 2
if ! ethtool enp5s0 | grep -q "Link detected: yes"; then done
echo "link still down after 2.5G; trying forced 1G"
ethtool -s enp5s0 speed 1000 duplex full autoneg off || true if [ "${stable}" != "true" ]; then
sleep 3 echo "link not stable at 1G-only; falling back to 100M-only autoneg"
ethtool -s enp5s0 advertise 0x008 autoneg on || true
sleep 5
fi fi
ethtool --show-eee enp5s0 || true
ethtool enp5s0 | sed -n "1,45p" || true ethtool enp5s0 | sed -n "1,45p" || true
ip -br addr show enp5s0 || true ip -br addr show enp5s0 || true
' '
sleep 60 sleep 30
done done