ops: add resource guardrails for rpi workers

This commit is contained in:
jenkins 2026-05-19 12:48:40 -03:00
parent c75e0d1b88
commit a3e14ce0f2
21 changed files with 592 additions and 14 deletions

View File

@ -0,0 +1,21 @@
# clusters/atlas/flux-system/platform/descheduler/kustomization.yaml
# Flux Kustomization that reconciles ./infrastructure/descheduler from the
# flux-system GitRepository, defaulting applied objects to kube-system.
---
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: descheduler
  namespace: flux-system
  annotations:
    kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
  # Ordering: the "helm" and "core" Kustomizations are listed as
  # prerequisites of this one.
  dependsOn:
    - name: helm
    - name: core
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  path: ./infrastructure/descheduler
  targetNamespace: kube-system
  interval: 30m
  prune: true
  wait: true

View File

@ -4,6 +4,8 @@ kind: Kustomization
resources:
- core/kustomization.yaml
- helm/kustomization.yaml
- descheduler/kustomization.yaml
- resource-guardrails/kustomization.yaml
- cert-manager/kustomization.yaml
- metallb/kustomization.yaml
- traefik/kustomization.yaml

View File

@ -0,0 +1,19 @@
# clusters/atlas/flux-system/platform/resource-guardrails/kustomization.yaml
# Flux Kustomization that reconciles ./infrastructure/resource-guardrails
# from the flux-system GitRepository.
---
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: resource-guardrails
  namespace: flux-system
  annotations:
    kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
  # Only "core" is declared as a prerequisite.
  # NOTE(review): the manifests under this path target many application
  # namespaces; confirm those namespaces exist before this reconciles.
  dependsOn:
    - name: core
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  path: ./infrastructure/resource-guardrails
  interval: 10m
  prune: true
  wait: true

View File

@ -10,5 +10,6 @@ resources:
- coredns-custom.yaml
- coredns-deployment.yaml
- ntp-sync-daemonset.yaml
- workload-profiles.yaml
- ../sources/cert-manager/letsencrypt.yaml
- ../sources/cert-manager/letsencrypt-prod.yaml

View File

@ -0,0 +1,27 @@
# infrastructure/core/workload-profiles.yaml
# Catalogue of named request/limit pairs, stored as an embedded YAML document
# under the profiles.yaml key. Workloads elsewhere carry an
# atlas.bstein.dev/workload-profile label naming one of these profiles.
# NOTE(review): nothing visible here consumes this ConfigMap automatically;
# it appears to be reference data - confirm.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: atlas-workload-profiles
  namespace: kube-system
data:
  profiles.yaml: |
    profiles:
      tiny:
        request: { cpu: 25m, memory: 64Mi }
        limit: { cpu: 200m, memory: 256Mi }
      light:
        request: { cpu: 50m, memory: 128Mi }
        limit: { cpu: 500m, memory: 512Mi }
      standard:
        request: { cpu: 250m, memory: 512Mi }
        limit: { cpu: "1", memory: 1Gi }
      heavy:
        request: { cpu: 500m, memory: 1Gi }
        limit: { cpu: 1500m, memory: 3Gi }
      ci:
        request: { cpu: 512m, memory: 512Mi }
        limit: { cpu: 1500m, memory: 2Gi }
      scavenger:
        request: { cpu: 10m, memory: 32Mi }
        limit: { cpu: 250m, memory: 256Mi }

View File

@ -0,0 +1,100 @@
# infrastructure/descheduler/helmrelease.yaml
# Installs the descheduler chart as a CronJob that runs every 20 minutes and
# evicts at most two pods per node / per namespace per run, using the
# "atlas-rpi-balance" profile defined below.
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: descheduler
  namespace: kube-system
spec:
  interval: 30m
  install:
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  chart:
    spec:
      chart: descheduler
      version: "0.33.0"
      sourceRef:
        kind: HelmRepository
        name: descheduler
        namespace: flux-system
  values:
    kind: CronJob
    schedule: "*/20 * * * *"
    successfulJobsHistoryLimit: 1
    failedJobsHistoryLimit: 3
    resources:
      requests:
        cpu: 50m
        memory: 96Mi
      limits:
        cpu: 200m
        memory: 256Mi
    deschedulerPolicyAPIVersion: descheduler/v1alpha2
    deschedulerPolicy:
      maxNoOfPodsToEvictPerNode: 2
      maxNoOfPodsToEvictPerNamespace: 2
      profiles:
        - name: atlas-rpi-balance
          pluginConfig:
            - name: DefaultEvictor
              args:
                nodeFit: true
                minPodAge: 10m
                # The chart is pinned to 0.33.0, so the flat DefaultEvictor
                # arguments are used here. The nested `podProtections` block
                # belongs to newer descheduler releases, and there
                # PodsWithLocalStorage / SystemCriticalPods are default
                # protections rather than `extraEnabled` values.
                # NOTE(review): re-check these arguments against the
                # descheduler README whenever the chart version is bumped.
                ignorePvcPods: true
                evictLocalStoragePods: false
                evictSystemCriticalPods: false
            - name: RemovePodsHavingTooManyRestarts
              args:
                podRestartThreshold: 12
                includingInitContainers: true
            - name: RemovePodsViolatingNodeAffinity
              args:
                nodeAffinityType:
                  - requiredDuringSchedulingIgnoredDuringExecution
            - name: RemovePodsViolatingTopologySpreadConstraint
            - name: RemovePodsViolatingNodeTaints
            - name: LowNodeUtilization
              args:
                # Nodes below every `thresholds` value count as
                # under-utilised; nodes above any `targetThresholds` value
                # are candidates to evict from.
                thresholds:
                  cpu: 45
                  memory: 45
                  pods: 45
                targetThresholds:
                  cpu: 75
                  memory: 75
                  pods: 75
          plugins:
            balance:
              enabled:
                - RemovePodsViolatingTopologySpreadConstraint
                - LowNodeUtilization
            deschedule:
              enabled:
                - RemovePodsHavingTooManyRestarts
                - RemovePodsViolatingNodeTaints
                - RemovePodsViolatingNodeAffinity
    priorityClassName: system-cluster-critical
    # The job is pinned to worker nodes and prefers (but does not require)
    # nodes labelled hardware=rpi5.
    nodeSelector:
      node-role.kubernetes.io/worker: "true"
    affinity:
      nodeAffinity:
        preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            preference:
              matchExpressions:
                - key: hardware
                  operator: In
                  values:
                    - rpi5
    tolerations:
      - key: node-role.kubernetes.io/control-plane
        operator: Exists
        effect: NoSchedule
      - key: node-role.kubernetes.io/master
        operator: Exists
        effect: NoSchedule

View File

@ -0,0 +1,5 @@
# infrastructure/descheduler/kustomization.yaml
# Kustomize entry point for the descheduler; its only resource is the
# HelmRelease in this directory.
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helmrelease.yaml

View File

@ -0,0 +1,5 @@
# infrastructure/resource-guardrails/kustomization.yaml
# Kustomize entry point for the resource guardrails; its only resource is
# the LimitRange list in this directory.
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - limitranges.yaml

View File

@ -0,0 +1,182 @@
# infrastructure/resource-guardrails/limitranges.yaml
# One LimitRange named atlas-default-compute per namespace. Each sets the
# default request and default limit for type Container.
#
# All items live in a single `kind: List` document so the &defaultCompute
# anchor declared on the first item can be aliased by the others; YAML anchors
# do not carry across `---` document boundaries. The jenkins namespace is the
# only one with its own, larger spec.
---
apiVersion: v1
kind: List
items:
  # Anchor definition: the shared default spec.
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: ai
    spec: &defaultCompute
      limits:
        - type: Container
          defaultRequest:
            cpu: 50m
            memory: 96Mi
          default:
            cpu: 500m
            memory: 512Mi

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: bstein-dev-home
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: cert-manager
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: climate
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: comms
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: crypto
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: finance
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: gitea
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: harbor
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: health
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: jellyfin
    spec: *defaultCompute

  # jenkins: explicit spec with higher defaults than the shared one.
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: jenkins
    spec:
      limits:
        - type: Container
          defaultRequest:
            cpu: 100m
            memory: 256Mi
          default:
            cpu: 1500m
            memory: 2Gi

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: logging
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: mailu-mailserver
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: maintenance
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: metallb-system
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: monitoring
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: nextcloud
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: outline
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: planka
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: postgres
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: quality
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: sso
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: sui-metrics
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: traefik
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: vault
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: vaultwarden
    spec: *defaultCompute

View File

@ -0,0 +1,9 @@
# infrastructure/sources/helm/descheduler.yaml
# Helm chart source for the descheduler; the index at the URL below is
# refreshed hourly.
---
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: descheduler
  namespace: flux-system
spec:
  url: https://kubernetes-sigs.github.io/descheduler/
  interval: 1h

View File

@ -3,6 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ananace.yaml
- descheduler.yaml
- fluent-bit.yaml
- grafana.yaml
- hashicorp.yaml

View File

@ -4,14 +4,18 @@ kind: Deployment
metadata:
name: monerod
namespace: crypto
labels: { app: monerod }
labels:
app: monerod
atlas.bstein.dev/workload-profile: heavy
spec:
replicas: 1
strategy: { type: Recreate }
selector: { matchLabels: { app: monerod } }
template:
metadata:
labels: { app: monerod }
labels:
app: monerod
atlas.bstein.dev/workload-profile: heavy
spec:
securityContext:
fsGroup: 1000
@ -41,6 +45,7 @@ spec:
- key: kubernetes.io/hostname
operator: NotIn
values: ["titan-12","titan-13","titan-15","titan-17","titan-19"]
terminationGracePeriodSeconds: 120
containers:
- name: monerod
image: registry.bstein.dev/crypto/monerod:0.18.4.1
@ -83,7 +88,13 @@ spec:
periodSeconds: 20
timeoutSeconds: 20
failureThreshold: 36
terminationGracePeriodSeconds: 120
resources:
requests:
cpu: 250m
memory: 1Gi
limits:
cpu: 1500m
memory: 3Gi
lifecycle:
preStop:
exec:

View File

@ -4,14 +4,18 @@ kind: Deployment
metadata:
name: monero-p2pool
namespace: crypto
labels: { app: monero-p2pool }
labels:
app: monero-p2pool
atlas.bstein.dev/workload-profile: light
spec:
replicas: 1
selector:
matchLabels: { app: monero-p2pool }
template:
metadata:
labels: { app: monero-p2pool }
labels:
app: monero-p2pool
atlas.bstein.dev/workload-profile: light
annotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "crypto"
@ -87,6 +91,13 @@ spec:
tcpSocket: { port: 3333 }
initialDelaySeconds: 10
periodSeconds: 10
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
volumeMounts:
- { name: p2pool-bin, mountPath: /opt/p2pool }
volumes:

View File

@ -4,14 +4,18 @@ kind: DaemonSet
metadata:
name: monero-xmrig
namespace: crypto
labels: { app: monero-xmrig }
labels:
app: monero-xmrig
atlas.bstein.dev/workload-profile: scavenger
spec:
selector:
matchLabels: { app: monero-xmrig }
updateStrategy: { type: RollingUpdate }
template:
metadata:
labels: { app: monero-xmrig }
labels:
app: monero-xmrig
atlas.bstein.dev/workload-profile: scavenger
spec:
priorityClassName: scavenger
nodeSelector:
@ -47,3 +51,10 @@ spec:
--donate-level N \
--cpu-priority 1 \
--threads "${THR}" ${EXTRA}
resources:
requests:
cpu: 10m
memory: 32Mi
limits:
cpu: 250m
memory: 256Mi

View File

@ -6,6 +6,7 @@ metadata:
namespace: gitea
labels:
app: gitea
atlas.bstein.dev/workload-profile: heavy
spec:
replicas: 1
selector:
@ -20,6 +21,7 @@ spec:
metadata:
labels:
app: gitea
atlas.bstein.dev/workload-profile: heavy
annotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/agent-init-first: "true"
@ -197,6 +199,13 @@ spec:
value: "true"
- name: SSH_PORT
value: "2242"
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: 1500m
memory: 2Gi
volumeMounts:
- name: gitea-data
mountPath: /data

View File

@ -77,10 +77,16 @@ spec:
internal:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
image:
repository: registry.bstein.dev/infra/harbor-redis
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-redis:tag"}
resources:
requests:
cpu: 50m
memory: 96Mi
limits:
cpu: 250m
memory: 256Mi
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
@ -114,10 +120,16 @@ spec:
core:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
image:
repository: registry.bstein.dev/infra/harbor-core
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-core:tag"}
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 750m
memory: 1Gi
serviceAccountName: harbor-vault-sync
automountServiceAccountToken: true
existingSecret: harbor-core
@ -180,10 +192,16 @@ spec:
jobservice:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
image:
repository: registry.bstein.dev/infra/harbor-jobservice
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-jobservice:tag"}
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
serviceAccountName: harbor-vault-sync
automountServiceAccountToken: true
existingSecret: harbor-jobservice
@ -227,10 +245,16 @@ spec:
portal:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
image:
repository: registry.bstein.dev/infra/harbor-portal
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-portal:tag"}
resources:
requests:
cpu: 25m
memory: 64Mi
limits:
cpu: 200m
memory: 128Mi
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
@ -255,7 +279,13 @@ spec:
registry:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 1
memory: 1Gi
registry:
image:
repository: registry.bstein.dev/infra/harbor-registry
@ -338,10 +368,16 @@ spec:
nginx:
nodeSelector:
ananke.bstein.dev/harbor-bootstrap: "true"
kubernetes.io/hostname: titan-11
image:
repository: registry.bstein.dev/infra/harbor-nginx
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-nginx:tag"}
resources:
requests:
cpu: 25m
memory: 64Mi
limits:
cpu: 200m
memory: 128Mi
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:

View File

@ -6,6 +6,7 @@ metadata:
namespace: jellyfin
labels:
app: jellyfin
atlas.bstein.dev/workload-profile: heavy
spec:
replicas: 1
strategy:
@ -20,6 +21,7 @@ spec:
metadata:
labels:
app: jellyfin
atlas.bstein.dev/workload-profile: heavy
annotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "pegasus"
@ -134,6 +136,9 @@ spec:
requests:
cpu: "500m"
memory: 1Gi
limits:
cpu: "1500m"
memory: 3Gi
volumeMounts:
- name: jellyfin-vault-entrypoint
mountPath: /entrypoint.sh

View File

@ -474,7 +474,7 @@ data:
plainText
clouds:
- kubernetes:
containerCapStr: "4"
containerCapStr: "3"
connectTimeout: "20"
readTimeout: "90"
jenkinsUrl: "http://jenkins.jenkins.svc.cluster.local:8080"

View File

@ -32,6 +32,7 @@ resources:
- disable-k3s-traefik-daemonset.yaml
- oneoffs/k3s-traefik-cleanup-job.yaml
- node-nofile-daemonset.yaml
- rpi-resource-reservation-daemonset.yaml
- metis-sentinel-amd64-daemonset.yaml
- metis-sentinel-arm64-daemonset.yaml
- k3s-agent-restart-daemonset.yaml
@ -84,3 +85,9 @@ configMapGenerator:
- node_image_sweeper.sh=scripts/node_image_sweeper.sh
options:
disableNameSuffixHash: true
- name: rpi-resource-reservation-script
namespace: maintenance
files:
- rpi_resource_reservation.sh=scripts/rpi_resource_reservation.sh
options:
disableNameSuffixHash: true

View File

@ -0,0 +1,69 @@
# services/maintenance/rpi-resource-reservation-daemonset.yaml
# Runs /scripts/rpi_resource_reservation.sh on worker nodes labelled
# hardware=rpi4 or hardware=rpi5. The script comes from the
# rpi-resource-reservation-script ConfigMap, and the host's root filesystem
# is mounted at /host.
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: rpi-resource-reservation
  namespace: maintenance
spec:
  selector:
    matchLabels:
      app: rpi-resource-reservation
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: rpi-resource-reservation
    spec:
      serviceAccountName: node-nofile
      # Scheduling: workers only, and only Raspberry Pi 4/5 hardware.
      nodeSelector:
        node-role.kubernetes.io/worker: "true"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: hardware
                    operator: In
                    values:
                      - rpi4
                      - rpi5
      # Tolerate the unschedulable, not-ready and unreachable node taints.
      tolerations:
        - key: node.kubernetes.io/unschedulable
          operator: Exists
          effect: NoSchedule
        - key: node.kubernetes.io/not-ready
          operator: Exists
          effect: NoExecute
        - key: node.kubernetes.io/unreachable
          operator: Exists
          effect: NoExecute
      containers:
        - name: reservation
          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
          # The script is passed to bash as an argument rather than executed
          # directly.
          command: ["/usr/bin/env", "bash"]
          args: ["/scripts/rpi_resource_reservation.sh"]
          # Runs privileged as root with the host filesystem mounted.
          securityContext:
            privileged: true
            runAsUser: 0
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 96Mi
          volumeMounts:
            - name: host-root
              mountPath: /host
            - name: script
              mountPath: /scripts
              readOnly: true
      volumes:
        - name: host-root
          hostPath:
            path: /
        - name: script
          configMap:
            name: rpi-resource-reservation-script
            defaultMode: 0555

View File

@ -0,0 +1,47 @@
#!/usr/bin/env bash
# Installs a k3s config drop-in that reserves CPU/memory/ephemeral storage for
# the system and kubelet and sets eviction thresholds, then restarts the
# k3s-agent unit so the settings take effect.
#
# Expects the host root filesystem at /host. Runs once, then sleeps forever.
#
# A restart-pending marker records that the drop-in was written but the agent
# has not yet been restarted. Without it, a container that dies during the
# jitter sleep, or a failed systemctl call, would leave the drop-in in place;
# the next run would see "already up to date" and never restart the agent.
set -euo pipefail

host_root="/host"
unit="k3s-agent"
unit_file="${host_root}/etc/systemd/system/${unit}.service"
config_dir="${host_root}/etc/rancher/k3s/config.yaml.d"
config_file="${config_dir}/90-atlas-rpi-reservations.yaml"
# Kept under the host's /run so a reboot clears it.
# NOTE(review): assumes /run on the host is a tmpfs and that a reboot restarts
# the agent with the current drop-in - confirm.
pending_marker="${host_root}/run/atlas-rpi-reservations.restart-pending"

if [ ! -f "${unit_file}" ]; then
  echo "k3s-agent unit not found; this guardrail only manages worker agents"
  sleep infinity
fi

tmp_file="$(mktemp)"
# Remove the scratch file even when a later command aborts the script.
trap 'rm -f "${tmp_file}"' EXIT

cat > "${tmp_file}" <<'EOF'
# Managed by Flux via services/maintenance/scripts/rpi_resource_reservation.sh.
# Keep RPi workers below saturation so kubelet and the OS keep enough headroom
# to evict or recover before the board wedges.
kubelet-arg+:
- "system-reserved=cpu=250m,memory=384Mi,ephemeral-storage=1Gi"
- "kube-reserved=cpu=150m,memory=256Mi,ephemeral-storage=1Gi"
- "eviction-hard=memory.available<512Mi,nodefs.available<10%,imagefs.available<10%"
- "eviction-soft=memory.available<768Mi,nodefs.available<15%,imagefs.available<15%"
- "eviction-soft-grace-period=memory.available=1m,nodefs.available=2m,imagefs.available=2m"
- "eviction-max-pod-grace-period=60"
EOF

changed=0
if [ ! -f "${config_file}" ] || ! cmp -s "${tmp_file}" "${config_file}"; then
  mkdir -p "${config_dir}"
  # Record the pending restart before touching the drop-in, so an
  # interruption at any later point still ends in a restart.
  : > "${pending_marker}"
  install -m 0644 "${tmp_file}" "${config_file}"
  changed=1
fi
rm -f "${tmp_file}"

if [ -e "${pending_marker}" ]; then
  # Random 30-449 s delay before restarting the agent.
  delay="$(( (RANDOM % 420) + 30 ))"
  if [ "${changed}" -eq 1 ]; then
    echo "updated ${config_file}; restarting ${unit} after ${delay}s"
  else
    echo "restart of ${unit} for ${config_file} still pending; restarting after ${delay}s"
  fi
  sleep "${delay}"
  chroot "${host_root}" /bin/systemctl daemon-reload
  chroot "${host_root}" /bin/systemctl restart "${unit}"
  # Clear the marker only after the restart command succeeded.
  rm -f "${pending_marker}"
else
  echo "${config_file} already up to date"
fi

sleep infinity