From f6167b4ac16a2da48606c696eb481cfc3fd3bbf1 Mon Sep 17 00:00:00 2001 From: jenkins Date: Fri, 19 Jun 2026 15:44:22 -0300 Subject: [PATCH] core: repair node role reconciler --- .../core/node-prefer-noschedule-cronjob.yaml | 22 +++++++++++++++---- .../core/node-prefer-noschedule-rbac.yaml | 2 +- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/infrastructure/core/node-prefer-noschedule-cronjob.yaml b/infrastructure/core/node-prefer-noschedule-cronjob.yaml index 405c3bbb..95b41313 100644 --- a/infrastructure/core/node-prefer-noschedule-cronjob.yaml +++ b/infrastructure/core/node-prefer-noschedule-cronjob.yaml @@ -24,8 +24,17 @@ spec: - bash - -ceu - | + KUBE_TOKEN_PATH="/var/run/secrets/kubernetes.io/serviceaccount/token" + KUBE_CA_PATH="/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + KUBE_SERVER="https://${KUBERNETES_SERVICE_HOST}:${KUBERNETES_SERVICE_PORT_HTTPS:-443}" + k() { - kubectl --request-timeout=10s "$@" + kubectl \ + --server="${KUBE_SERVER}" \ + --certificate-authority="${KUBE_CA_PATH}" \ + --token="$(cat "${KUBE_TOKEN_PATH}")" \ + --request-timeout=10s \ + "$@" } clear_worker() { @@ -33,9 +42,9 @@ spec: local hardware="${2}" if k get node "${node}" >/dev/null 2>&1; then k label node "${node}" node-role.kubernetes.io/worker=true "hardware=${hardware}" --overwrite=true || true + k label node "${node}" node-role.kubernetes.io/storage-backbone- || true k label node "${node}" atlas.bstein.dev/spillover- || true - k taint node "${node}" node.kubernetes.io/unschedulable:NoSchedule- || true - k uncordon "${node}" || true + # Recovery cordons are owned by Ananke, not this role reconciler. else echo "skipping missing node ${node}" fi @@ -71,7 +80,12 @@ spec: for node in titan-13 titan-15 titan-17 titan-19; do if k get node "${node}" >/dev/null 2>&1; then - k label node "${node}" atlas.bstein.dev/spillover=true longhorn-host=true --overwrite=true || true + k label node "${node}" \ + atlas.bstein.dev/spillover=true \ + longhorn-host=true \ + node-role.kubernetes.io/worker=true \ + node-role.kubernetes.io/storage-backbone=true \ + --overwrite=true || true k taint node "${node}" longhorn=true:PreferNoSchedule --overwrite=true || true k taint node "${node}" atlas.bstein.dev/spillover=true:PreferNoSchedule --overwrite=true || true else diff --git a/infrastructure/core/node-prefer-noschedule-rbac.yaml b/infrastructure/core/node-prefer-noschedule-rbac.yaml index 73d69525..e1ff0e78 100644 --- a/infrastructure/core/node-prefer-noschedule-rbac.yaml +++ b/infrastructure/core/node-prefer-noschedule-rbac.yaml @@ -6,7 +6,7 @@ metadata: rules: - apiGroups: [""] resources: ["nodes"] - verbs: ["get", "list", "patch"] + verbs: ["get", "list", "patch", "update"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding