maintenance: stabilize recovered worker nodes
This commit is contained in:
parent
46c3e97688
commit
c7edc81239
@ -5,17 +5,17 @@ metadata:
|
|||||||
name: node-prefer-noschedule
|
name: node-prefer-noschedule
|
||||||
namespace: kube-system
|
namespace: kube-system
|
||||||
spec:
|
spec:
|
||||||
schedule: "*/20 * * * *"
|
schedule: "* * * * *"
|
||||||
concurrencyPolicy: Forbid
|
concurrencyPolicy: Replace
|
||||||
successfulJobsHistoryLimit: 1
|
successfulJobsHistoryLimit: 1
|
||||||
failedJobsHistoryLimit: 3
|
failedJobsHistoryLimit: 3
|
||||||
jobTemplate:
|
jobTemplate:
|
||||||
spec:
|
spec:
|
||||||
backoffLimit: 1
|
backoffLimit: 0
|
||||||
template:
|
template:
|
||||||
spec:
|
spec:
|
||||||
serviceAccountName: node-prefer-noschedule
|
serviceAccountName: node-prefer-noschedule
|
||||||
restartPolicy: OnFailure
|
restartPolicy: Never
|
||||||
containers:
|
containers:
|
||||||
- name: taint
|
- name: taint
|
||||||
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
|
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
|
||||||
@ -24,11 +24,42 @@ spec:
|
|||||||
- bash
|
- bash
|
||||||
- -ceu
|
- -ceu
|
||||||
- |
|
- |
|
||||||
|
# k: thin wrapper around kubectl.
# Arguments: "$@" - forwarded to kubectl unchanged.
# Every call gets --request-timeout=10s prepended.
# NOTE(review): presumably this keeps one unresponsive API request from
# stalling the whole job run - confirm the 10s budget against the schedule.
k() {
|
||||||
|
kubectl --request-timeout=10s "$@"
|
||||||
|
}
|
||||||
|
|
||||||
|
# clear_worker: relabel one worker node and make it schedulable again.
# Arguments:
#   $1 - node name
#   $2 - value written to the node's "hardware" label
# Outputs: prints "skipping missing node <name>" when `k get node` fails.
# Each mutating step ends in `|| true`, so a failure of one step does not
# abort the script (it is started with `bash -ceu`) and the remaining steps
# and nodes are still attempted.
clear_worker() {
|
||||||
|
local node="${1}"
|
||||||
|
local hardware="${2}"
|
||||||
|
# Only touch nodes the API server currently knows about.
if k get node "${node}" >/dev/null 2>&1; then
|
||||||
|
# Set the worker role label and the hardware label, overwriting old values.
k label node "${node}" node-role.kubernetes.io/worker=true "hardware=${hardware}" --overwrite=true || true
|
||||||
|
# A trailing "-" after a label key asks kubectl to remove that label.
k label node "${node}" atlas.bstein.dev/spillover- || true
|
||||||
|
# A trailing "-" after key:effect asks kubectl to remove that taint.
k taint node "${node}" node.kubernetes.io/unschedulable:NoSchedule- || true
|
||||||
|
k uncordon "${node}" || true
|
||||||
|
else
|
||||||
|
echo "skipping missing node ${node}"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Run clear_worker for each listed node; the second argument becomes the
# node's "hardware" label value (rpi5, rpi4 or amd64).
clear_worker titan-04 rpi5
|
||||||
|
clear_worker titan-05 rpi5
|
||||||
|
clear_worker titan-07 rpi5
|
||||||
|
clear_worker titan-08 rpi5
|
||||||
|
clear_worker titan-11 rpi5
|
||||||
|
clear_worker titan-12 rpi4
|
||||||
|
clear_worker titan-14 rpi4
|
||||||
|
clear_worker titan-18 rpi4
|
||||||
|
clear_worker titan-22 amd64
|
||||||
|
|
||||||
|
# titan-22 additionally gets the general-compute=last-resort label, applied
# only if the node exists; a label failure is ignored via `|| true`.
if k get node titan-22 >/dev/null 2>&1; then
|
||||||
|
k label node titan-22 atlas.bstein.dev/general-compute=last-resort --overwrite=true || true
|
||||||
|
fi
|
||||||
|
|
||||||
for node in titan-13 titan-15 titan-17 titan-19; do
|
for node in titan-13 titan-15 titan-17 titan-19; do
|
||||||
if kubectl get node "${node}" >/dev/null 2>&1; then
|
if k get node "${node}" >/dev/null 2>&1; then
|
||||||
kubectl label node "${node}" atlas.bstein.dev/spillover=true --overwrite=true
|
k label node "${node}" atlas.bstein.dev/spillover=true longhorn-host=true --overwrite=true || true
|
||||||
kubectl taint node "${node}" longhorn=true:PreferNoSchedule --overwrite=true
|
k taint node "${node}" longhorn=true:PreferNoSchedule --overwrite=true || true
|
||||||
kubectl taint node "${node}" atlas.bstein.dev/spillover=true:PreferNoSchedule --overwrite=true
|
k taint node "${node}" atlas.bstein.dev/spillover=true:PreferNoSchedule --overwrite=true || true
|
||||||
else
|
else
|
||||||
echo "skipping missing node ${node}"
|
echo "skipping missing node ${node}"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@ -59,32 +59,28 @@ spec:
|
|||||||
while true; do
|
while true; do
|
||||||
nsenter -t 1 -m -u -i -n -p -- /bin/sh -lc '
|
nsenter -t 1 -m -u -i -n -p -- /bin/sh -lc '
|
||||||
date -Is
|
date -Is
|
||||||
if ethtool enp5s0 | grep -q "Link detected: yes"; then
|
|
||||||
ethtool enp5s0 | sed -n "1,25p" || true
|
|
||||||
ip -br addr show enp5s0 || true
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "link down; attempting recovery"
|
|
||||||
ip link set enp5s0 up || true
|
ip link set enp5s0 up || true
|
||||||
ethtool --set-eee enp5s0 eee off || true
|
ethtool --set-eee enp5s0 eee off || true
|
||||||
ethtool -s enp5s0 advertise 0x80000000002f autoneg on || ethtool -s enp5s0 autoneg on || true
|
ethtool -s enp5s0 advertise 0x020 autoneg on || true
|
||||||
|
sleep 3
|
||||||
|
|
||||||
|
stable=true
|
||||||
|
for _ in 1 2 3 4 5; do
|
||||||
|
if ! ethtool enp5s0 | grep -q "Link detected: yes"; then
|
||||||
|
stable=false
|
||||||
|
fi
|
||||||
|
sleep 2
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ "${stable}" != "true" ]; then
|
||||||
|
echo "link not stable at 1G-only; falling back to 100M-only autoneg"
|
||||||
|
ethtool -s enp5s0 advertise 0x008 autoneg on || true
|
||||||
sleep 5
|
sleep 5
|
||||||
|
|
||||||
if ! ethtool enp5s0 | grep -q "Link detected: yes"; then
|
|
||||||
echo "link still down after autoneg; trying forced 2.5G"
|
|
||||||
ethtool -s enp5s0 speed 2500 duplex full autoneg off || true
|
|
||||||
sleep 3
|
|
||||||
fi
|
|
||||||
|
|
||||||
if ! ethtool enp5s0 | grep -q "Link detected: yes"; then
|
|
||||||
echo "link still down after 2.5G; trying forced 1G"
|
|
||||||
ethtool -s enp5s0 speed 1000 duplex full autoneg off || true
|
|
||||||
sleep 3
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
ethtool --show-eee enp5s0 || true
|
||||||
ethtool enp5s0 | sed -n "1,45p" || true
|
ethtool enp5s0 | sed -n "1,45p" || true
|
||||||
ip -br addr show enp5s0 || true
|
ip -br addr show enp5s0 || true
|
||||||
'
|
'
|
||||||
sleep 60
|
sleep 30
|
||||||
done
|
done
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user