ops: add resource guardrails for rpi workers
This commit is contained in:
parent
c75e0d1b88
commit
a3e14ce0f2
@ -0,0 +1,21 @@
|
|||||||
|
# clusters/atlas/flux-system/platform/descheduler/kustomization.yaml
#
# Flux Kustomization that reconciles ./infrastructure/descheduler from the
# flux-system GitRepository into the kube-system namespace.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: descheduler
  namespace: flux-system
  annotations:
    # With ssa: IfNotPresent the applier only creates this object when it is
    # missing; edits made to this manifest in Git are not re-applied while the
    # object exists.
    # NOTE(review): confirm that create-only behaviour is intended here.
    kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
  # Reconcile only after the Kustomizations named below are ready.
  dependsOn:
    - name: helm
    - name: core
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  path: ./infrastructure/descheduler
  # Namespace forced onto the objects rendered from `path`.
  targetNamespace: kube-system
  interval: 30m
  # Delete cluster objects that disappear from `path`.
  prune: true
  # Block readiness of this Kustomization on the health of what it applied.
  wait: true
|
||||||
@ -4,6 +4,8 @@ kind: Kustomization
|
|||||||
resources:
|
resources:
|
||||||
- core/kustomization.yaml
|
- core/kustomization.yaml
|
||||||
- helm/kustomization.yaml
|
- helm/kustomization.yaml
|
||||||
|
- descheduler/kustomization.yaml
|
||||||
|
- resource-guardrails/kustomization.yaml
|
||||||
- cert-manager/kustomization.yaml
|
- cert-manager/kustomization.yaml
|
||||||
- metallb/kustomization.yaml
|
- metallb/kustomization.yaml
|
||||||
- traefik/kustomization.yaml
|
- traefik/kustomization.yaml
|
||||||
|
|||||||
@ -0,0 +1,19 @@
|
|||||||
|
# clusters/atlas/flux-system/platform/resource-guardrails/kustomization.yaml
#
# Flux Kustomization that reconciles ./infrastructure/resource-guardrails from
# the flux-system GitRepository. No targetNamespace is set, so each rendered
# object keeps the namespace declared in its own manifest.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: resource-guardrails
  namespace: flux-system
  annotations:
    # With ssa: IfNotPresent the applier only creates this object when it is
    # missing; edits made to this manifest in Git are not re-applied while the
    # object exists.
    # NOTE(review): confirm that create-only behaviour is intended here.
    kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
  # Reconcile only after the `core` Kustomization is ready.
  dependsOn:
    - name: core
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  path: ./infrastructure/resource-guardrails
  interval: 10m
  # Delete cluster objects that disappear from `path`.
  prune: true
  # Block readiness of this Kustomization on the health of what it applied.
  wait: true
|
||||||
@ -10,5 +10,6 @@ resources:
|
|||||||
- coredns-custom.yaml
|
- coredns-custom.yaml
|
||||||
- coredns-deployment.yaml
|
- coredns-deployment.yaml
|
||||||
- ntp-sync-daemonset.yaml
|
- ntp-sync-daemonset.yaml
|
||||||
|
- workload-profiles.yaml
|
||||||
- ../sources/cert-manager/letsencrypt.yaml
|
- ../sources/cert-manager/letsencrypt.yaml
|
||||||
- ../sources/cert-manager/letsencrypt-prod.yaml
|
- ../sources/cert-manager/letsencrypt-prod.yaml
|
||||||
|
|||||||
27
infrastructure/core/workload-profiles.yaml
Normal file
27
infrastructure/core/workload-profiles.yaml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# infrastructure/core/workload-profiles.yaml
#
# Catalogue of named request/limit pairs (tiny, light, standard, heavy, ci,
# scavenger) published as a single `profiles.yaml` document.
# NOTE(review): nothing in this manifest enforces the values; presumably they
# document the sizes behind the `atlas.bstein.dev/workload-profile` label --
# confirm which consumer, if any, reads this ConfigMap.
apiVersion: v1
kind: ConfigMap
metadata:
  name: atlas-workload-profiles
  namespace: kube-system
data:
  # Literal block scalar: the text below is stored verbatim as the value.
  profiles.yaml: |
    profiles:
      tiny:
        request: { cpu: 25m, memory: 64Mi }
        limit: { cpu: 200m, memory: 256Mi }
      light:
        request: { cpu: 50m, memory: 128Mi }
        limit: { cpu: 500m, memory: 512Mi }
      standard:
        request: { cpu: 250m, memory: 512Mi }
        limit: { cpu: "1", memory: 1Gi }
      heavy:
        request: { cpu: 500m, memory: 1Gi }
        limit: { cpu: 1500m, memory: 3Gi }
      ci:
        request: { cpu: 512m, memory: 512Mi }
        limit: { cpu: 1500m, memory: 2Gi }
      scavenger:
        request: { cpu: 10m, memory: 32Mi }
        limit: { cpu: 250m, memory: 256Mi }
|
||||||
100
infrastructure/descheduler/helmrelease.yaml
Normal file
100
infrastructure/descheduler/helmrelease.yaml
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
# infrastructure/descheduler/helmrelease.yaml
#
# Installs the descheduler chart as a CronJob that runs every 20 minutes with a
# single policy profile (atlas-rpi-balance).
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: descheduler
  namespace: kube-system
spec:
  interval: 30m
  chart:
    spec:
      chart: descheduler
      version: "0.33.0"
      sourceRef:
        kind: HelmRepository
        name: descheduler
        namespace: flux-system
  # Retry a failed install or upgrade up to three times before giving up.
  install:
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    # --- workload shape ---------------------------------------------------
    kind: CronJob
    schedule: "*/20 * * * *"
    successfulJobsHistoryLimit: 1
    failedJobsHistoryLimit: 3
    priorityClassName: system-cluster-critical
    resources:
      requests:
        cpu: 50m
        memory: 96Mi
      limits:
        cpu: 200m
        memory: 256Mi

    # --- placement of the descheduler pod itself --------------------------
    # Must land on a node labelled as worker; rpi5 hardware is preferred but
    # not required.
    nodeSelector:
      node-role.kubernetes.io/worker: "true"
    affinity:
      nodeAffinity:
        preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            preference:
              matchExpressions:
                - key: hardware
                  operator: In
                  values:
                    - rpi5
    tolerations:
      - key: node-role.kubernetes.io/control-plane
        operator: Exists
        effect: NoSchedule
      - key: node-role.kubernetes.io/master
        operator: Exists
        effect: NoSchedule

    # --- eviction policy --------------------------------------------------
    deschedulerPolicyAPIVersion: descheduler/v1alpha2
    deschedulerPolicy:
      # Per-run eviction caps.
      maxNoOfPodsToEvictPerNode: 2
      maxNoOfPodsToEvictPerNamespace: 2
      profiles:
        - name: atlas-rpi-balance
          pluginConfig:
            - name: DefaultEvictor
              args:
                nodeFit: true
                minPodAge: 10m
                # NOTE(review): verify that the descheduler release shipped by
                # chart 0.33.0 accepts `podProtections`, and that
                # PodsWithLocalStorage / SystemCriticalPods are valid
                # `extraEnabled` values rather than default protections; an
                # unknown field or value may make the policy fail to load.
                podProtections:
                  extraEnabled:
                    - PodsWithPVC
                    - PodsWithLocalStorage
                    - SystemCriticalPods
            - name: RemovePodsHavingTooManyRestarts
              args:
                podRestartThreshold: 12
                includingInitContainers: true
            - name: RemovePodsViolatingNodeAffinity
              args:
                nodeAffinityType:
                  - requiredDuringSchedulingIgnoredDuringExecution
            - name: RemovePodsViolatingTopologySpreadConstraint
            - name: RemovePodsViolatingNodeTaints
            - name: LowNodeUtilization
              args:
                # Percentages per resource: `thresholds` is the lower bound,
                # `targetThresholds` the upper bound.
                thresholds:
                  cpu: 45
                  memory: 45
                  pods: 45
                targetThresholds:
                  cpu: 75
                  memory: 75
                  pods: 75
          # Which of the configured plugins run at each extension point.
          plugins:
            balance:
              enabled:
                - RemovePodsViolatingTopologySpreadConstraint
                - LowNodeUtilization
            deschedule:
              enabled:
                - RemovePodsHavingTooManyRestarts
                - RemovePodsViolatingNodeTaints
                - RemovePodsViolatingNodeAffinity
|
||||||
5
infrastructure/descheduler/kustomization.yaml
Normal file
5
infrastructure/descheduler/kustomization.yaml
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
# infrastructure/descheduler/kustomization.yaml
#
# Kustomize entry point for the descheduler directory; it renders exactly one
# manifest, the HelmRelease.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helmrelease.yaml
|
||||||
5
infrastructure/resource-guardrails/kustomization.yaml
Normal file
5
infrastructure/resource-guardrails/kustomization.yaml
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
# infrastructure/resource-guardrails/kustomization.yaml
#
# Kustomize entry point for the resource-guardrails directory; it renders
# exactly one manifest, the LimitRange list.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - limitranges.yaml
|
||||||
182
infrastructure/resource-guardrails/limitranges.yaml
Normal file
182
infrastructure/resource-guardrails/limitranges.yaml
Normal file
@ -0,0 +1,182 @@
|
|||||||
|
# infrastructure/resource-guardrails/limitranges.yaml
#
# One LimitRange named atlas-default-compute per namespace. Each supplies the
# request/limit values given to containers that do not declare their own.
#
# The first item defines the shared spec under the YAML anchor
# `defaultCompute`; every later `spec: *defaultCompute` reuses it. Aliases only
# resolve inside a single YAML document, so all items must stay in this one
# List. The jenkins namespace is the only one with its own, larger spec.
apiVersion: v1
kind: List
items:
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: ai
    # Anchor definition: the shared defaults reused by the aliases below.
    spec: &defaultCompute
      limits:
        - type: Container
          # Limit applied when a container declares none.
          default:
            cpu: 500m
            memory: 512Mi
          # Request applied when a container declares none.
          defaultRequest:
            cpu: 50m
            memory: 96Mi

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: bstein-dev-home
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: cert-manager
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: climate
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: comms
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: crypto
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: finance
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: gitea
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: harbor
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: health
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: jellyfin
    spec: *defaultCompute

  # jenkins: explicit spec with higher defaults than the shared anchor.
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: jenkins
    spec:
      limits:
        - type: Container
          default:
            cpu: 1500m
            memory: 2Gi
          defaultRequest:
            cpu: 100m
            memory: 256Mi

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: logging
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: mailu-mailserver
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: maintenance
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: metallb-system
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: monitoring
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: nextcloud
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: outline
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: planka
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: postgres
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: quality
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: sso
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: sui-metrics
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: traefik
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: vault
    spec: *defaultCompute

  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: vaultwarden
    spec: *defaultCompute
|
||||||
9
infrastructure/sources/helm/descheduler.yaml
Normal file
9
infrastructure/sources/helm/descheduler.yaml
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
# infrastructure/sources/helm/descheduler.yaml
#
# Helm chart index for the descheduler project; the index is re-fetched hourly.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: descheduler
  namespace: flux-system
spec:
  url: https://kubernetes-sigs.github.io/descheduler/
  interval: 1h
|
||||||
@ -3,6 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1
|
|||||||
kind: Kustomization
|
kind: Kustomization
|
||||||
resources:
|
resources:
|
||||||
- ananace.yaml
|
- ananace.yaml
|
||||||
|
- descheduler.yaml
|
||||||
- fluent-bit.yaml
|
- fluent-bit.yaml
|
||||||
- grafana.yaml
|
- grafana.yaml
|
||||||
- hashicorp.yaml
|
- hashicorp.yaml
|
||||||
|
|||||||
@ -4,14 +4,18 @@ kind: Deployment
|
|||||||
metadata:
|
metadata:
|
||||||
name: monerod
|
name: monerod
|
||||||
namespace: crypto
|
namespace: crypto
|
||||||
labels: { app: monerod }
|
labels:
|
||||||
|
app: monerod
|
||||||
|
atlas.bstein.dev/workload-profile: heavy
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
strategy: { type: Recreate }
|
strategy: { type: Recreate }
|
||||||
selector: { matchLabels: { app: monerod } }
|
selector: { matchLabels: { app: monerod } }
|
||||||
template:
|
template:
|
||||||
metadata:
|
metadata:
|
||||||
labels: { app: monerod }
|
labels:
|
||||||
|
app: monerod
|
||||||
|
atlas.bstein.dev/workload-profile: heavy
|
||||||
spec:
|
spec:
|
||||||
securityContext:
|
securityContext:
|
||||||
fsGroup: 1000
|
fsGroup: 1000
|
||||||
@ -41,6 +45,7 @@ spec:
|
|||||||
- key: kubernetes.io/hostname
|
- key: kubernetes.io/hostname
|
||||||
operator: NotIn
|
operator: NotIn
|
||||||
values: ["titan-12","titan-13","titan-15","titan-17","titan-19"]
|
values: ["titan-12","titan-13","titan-15","titan-17","titan-19"]
|
||||||
|
terminationGracePeriodSeconds: 120
|
||||||
containers:
|
containers:
|
||||||
- name: monerod
|
- name: monerod
|
||||||
image: registry.bstein.dev/crypto/monerod:0.18.4.1
|
image: registry.bstein.dev/crypto/monerod:0.18.4.1
|
||||||
@ -83,7 +88,13 @@ spec:
|
|||||||
periodSeconds: 20
|
periodSeconds: 20
|
||||||
timeoutSeconds: 20
|
timeoutSeconds: 20
|
||||||
failureThreshold: 36
|
failureThreshold: 36
|
||||||
terminationGracePeriodSeconds: 120
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 250m
|
||||||
|
memory: 1Gi
|
||||||
|
limits:
|
||||||
|
cpu: 1500m
|
||||||
|
memory: 3Gi
|
||||||
lifecycle:
|
lifecycle:
|
||||||
preStop:
|
preStop:
|
||||||
exec:
|
exec:
|
||||||
|
|||||||
@ -4,14 +4,18 @@ kind: Deployment
|
|||||||
metadata:
|
metadata:
|
||||||
name: monero-p2pool
|
name: monero-p2pool
|
||||||
namespace: crypto
|
namespace: crypto
|
||||||
labels: { app: monero-p2pool }
|
labels:
|
||||||
|
app: monero-p2pool
|
||||||
|
atlas.bstein.dev/workload-profile: light
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
selector:
|
selector:
|
||||||
matchLabels: { app: monero-p2pool }
|
matchLabels: { app: monero-p2pool }
|
||||||
template:
|
template:
|
||||||
metadata:
|
metadata:
|
||||||
labels: { app: monero-p2pool }
|
labels:
|
||||||
|
app: monero-p2pool
|
||||||
|
atlas.bstein.dev/workload-profile: light
|
||||||
annotations:
|
annotations:
|
||||||
vault.hashicorp.com/agent-inject: "true"
|
vault.hashicorp.com/agent-inject: "true"
|
||||||
vault.hashicorp.com/role: "crypto"
|
vault.hashicorp.com/role: "crypto"
|
||||||
@ -87,6 +91,13 @@ spec:
|
|||||||
tcpSocket: { port: 3333 }
|
tcpSocket: { port: 3333 }
|
||||||
initialDelaySeconds: 10
|
initialDelaySeconds: 10
|
||||||
periodSeconds: 10
|
periodSeconds: 10
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 128Mi
|
||||||
|
limits:
|
||||||
|
cpu: 500m
|
||||||
|
memory: 512Mi
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- { name: p2pool-bin, mountPath: /opt/p2pool }
|
- { name: p2pool-bin, mountPath: /opt/p2pool }
|
||||||
volumes:
|
volumes:
|
||||||
|
|||||||
@ -4,14 +4,18 @@ kind: DaemonSet
|
|||||||
metadata:
|
metadata:
|
||||||
name: monero-xmrig
|
name: monero-xmrig
|
||||||
namespace: crypto
|
namespace: crypto
|
||||||
labels: { app: monero-xmrig }
|
labels:
|
||||||
|
app: monero-xmrig
|
||||||
|
atlas.bstein.dev/workload-profile: scavenger
|
||||||
spec:
|
spec:
|
||||||
selector:
|
selector:
|
||||||
matchLabels: { app: monero-xmrig }
|
matchLabels: { app: monero-xmrig }
|
||||||
updateStrategy: { type: RollingUpdate }
|
updateStrategy: { type: RollingUpdate }
|
||||||
template:
|
template:
|
||||||
metadata:
|
metadata:
|
||||||
labels: { app: monero-xmrig }
|
labels:
|
||||||
|
app: monero-xmrig
|
||||||
|
atlas.bstein.dev/workload-profile: scavenger
|
||||||
spec:
|
spec:
|
||||||
priorityClassName: scavenger
|
priorityClassName: scavenger
|
||||||
nodeSelector:
|
nodeSelector:
|
||||||
@ -47,3 +51,10 @@ spec:
|
|||||||
--donate-level N \
|
--donate-level N \
|
||||||
--cpu-priority 1 \
|
--cpu-priority 1 \
|
||||||
--threads "${THR}" ${EXTRA}
|
--threads "${THR}" ${EXTRA}
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 10m
|
||||||
|
memory: 32Mi
|
||||||
|
limits:
|
||||||
|
cpu: 250m
|
||||||
|
memory: 256Mi
|
||||||
|
|||||||
@ -6,6 +6,7 @@ metadata:
|
|||||||
namespace: gitea
|
namespace: gitea
|
||||||
labels:
|
labels:
|
||||||
app: gitea
|
app: gitea
|
||||||
|
atlas.bstein.dev/workload-profile: heavy
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
selector:
|
selector:
|
||||||
@ -20,6 +21,7 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: gitea
|
app: gitea
|
||||||
|
atlas.bstein.dev/workload-profile: heavy
|
||||||
annotations:
|
annotations:
|
||||||
vault.hashicorp.com/agent-inject: "true"
|
vault.hashicorp.com/agent-inject: "true"
|
||||||
vault.hashicorp.com/agent-init-first: "true"
|
vault.hashicorp.com/agent-init-first: "true"
|
||||||
@ -197,6 +199,13 @@ spec:
|
|||||||
value: "true"
|
value: "true"
|
||||||
- name: SSH_PORT
|
- name: SSH_PORT
|
||||||
value: "2242"
|
value: "2242"
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 500m
|
||||||
|
memory: 1Gi
|
||||||
|
limits:
|
||||||
|
cpu: 1500m
|
||||||
|
memory: 2Gi
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: gitea-data
|
- name: gitea-data
|
||||||
mountPath: /data
|
mountPath: /data
|
||||||
|
|||||||
@ -77,10 +77,16 @@ spec:
|
|||||||
internal:
|
internal:
|
||||||
nodeSelector:
|
nodeSelector:
|
||||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
kubernetes.io/hostname: titan-11
|
|
||||||
image:
|
image:
|
||||||
repository: registry.bstein.dev/infra/harbor-redis
|
repository: registry.bstein.dev/infra/harbor-redis
|
||||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-redis:tag"}
|
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-redis:tag"}
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 50m
|
||||||
|
memory: 96Mi
|
||||||
|
limits:
|
||||||
|
cpu: 250m
|
||||||
|
memory: 256Mi
|
||||||
affinity:
|
affinity:
|
||||||
nodeAffinity:
|
nodeAffinity:
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
@ -114,10 +120,16 @@ spec:
|
|||||||
core:
|
core:
|
||||||
nodeSelector:
|
nodeSelector:
|
||||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
kubernetes.io/hostname: titan-11
|
|
||||||
image:
|
image:
|
||||||
repository: registry.bstein.dev/infra/harbor-core
|
repository: registry.bstein.dev/infra/harbor-core
|
||||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-core:tag"}
|
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-core:tag"}
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 256Mi
|
||||||
|
limits:
|
||||||
|
cpu: 750m
|
||||||
|
memory: 1Gi
|
||||||
serviceAccountName: harbor-vault-sync
|
serviceAccountName: harbor-vault-sync
|
||||||
automountServiceAccountToken: true
|
automountServiceAccountToken: true
|
||||||
existingSecret: harbor-core
|
existingSecret: harbor-core
|
||||||
@ -180,10 +192,16 @@ spec:
|
|||||||
jobservice:
|
jobservice:
|
||||||
nodeSelector:
|
nodeSelector:
|
||||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
kubernetes.io/hostname: titan-11
|
|
||||||
image:
|
image:
|
||||||
repository: registry.bstein.dev/infra/harbor-jobservice
|
repository: registry.bstein.dev/infra/harbor-jobservice
|
||||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-jobservice:tag"}
|
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-jobservice:tag"}
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 50m
|
||||||
|
memory: 128Mi
|
||||||
|
limits:
|
||||||
|
cpu: 500m
|
||||||
|
memory: 512Mi
|
||||||
serviceAccountName: harbor-vault-sync
|
serviceAccountName: harbor-vault-sync
|
||||||
automountServiceAccountToken: true
|
automountServiceAccountToken: true
|
||||||
existingSecret: harbor-jobservice
|
existingSecret: harbor-jobservice
|
||||||
@ -227,10 +245,16 @@ spec:
|
|||||||
portal:
|
portal:
|
||||||
nodeSelector:
|
nodeSelector:
|
||||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
kubernetes.io/hostname: titan-11
|
|
||||||
image:
|
image:
|
||||||
repository: registry.bstein.dev/infra/harbor-portal
|
repository: registry.bstein.dev/infra/harbor-portal
|
||||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-portal:tag"}
|
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-portal:tag"}
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 25m
|
||||||
|
memory: 64Mi
|
||||||
|
limits:
|
||||||
|
cpu: 200m
|
||||||
|
memory: 128Mi
|
||||||
affinity:
|
affinity:
|
||||||
nodeAffinity:
|
nodeAffinity:
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
@ -255,7 +279,13 @@ spec:
|
|||||||
registry:
|
registry:
|
||||||
nodeSelector:
|
nodeSelector:
|
||||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
kubernetes.io/hostname: titan-11
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 256Mi
|
||||||
|
limits:
|
||||||
|
cpu: 1
|
||||||
|
memory: 1Gi
|
||||||
registry:
|
registry:
|
||||||
image:
|
image:
|
||||||
repository: registry.bstein.dev/infra/harbor-registry
|
repository: registry.bstein.dev/infra/harbor-registry
|
||||||
@ -338,10 +368,16 @@ spec:
|
|||||||
nginx:
|
nginx:
|
||||||
nodeSelector:
|
nodeSelector:
|
||||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||||
kubernetes.io/hostname: titan-11
|
|
||||||
image:
|
image:
|
||||||
repository: registry.bstein.dev/infra/harbor-nginx
|
repository: registry.bstein.dev/infra/harbor-nginx
|
||||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-nginx:tag"}
|
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-nginx:tag"}
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 25m
|
||||||
|
memory: 64Mi
|
||||||
|
limits:
|
||||||
|
cpu: 200m
|
||||||
|
memory: 128Mi
|
||||||
affinity:
|
affinity:
|
||||||
nodeAffinity:
|
nodeAffinity:
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
|||||||
@ -6,6 +6,7 @@ metadata:
|
|||||||
namespace: jellyfin
|
namespace: jellyfin
|
||||||
labels:
|
labels:
|
||||||
app: jellyfin
|
app: jellyfin
|
||||||
|
atlas.bstein.dev/workload-profile: heavy
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
strategy:
|
strategy:
|
||||||
@ -20,6 +21,7 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: jellyfin
|
app: jellyfin
|
||||||
|
atlas.bstein.dev/workload-profile: heavy
|
||||||
annotations:
|
annotations:
|
||||||
vault.hashicorp.com/agent-inject: "true"
|
vault.hashicorp.com/agent-inject: "true"
|
||||||
vault.hashicorp.com/role: "pegasus"
|
vault.hashicorp.com/role: "pegasus"
|
||||||
@ -134,6 +136,9 @@ spec:
|
|||||||
requests:
|
requests:
|
||||||
cpu: "500m"
|
cpu: "500m"
|
||||||
memory: 1Gi
|
memory: 1Gi
|
||||||
|
limits:
|
||||||
|
cpu: "1500m"
|
||||||
|
memory: 3Gi
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: jellyfin-vault-entrypoint
|
- name: jellyfin-vault-entrypoint
|
||||||
mountPath: /entrypoint.sh
|
mountPath: /entrypoint.sh
|
||||||
|
|||||||
@ -474,7 +474,7 @@ data:
|
|||||||
plainText
|
plainText
|
||||||
clouds:
|
clouds:
|
||||||
- kubernetes:
|
- kubernetes:
|
||||||
containerCapStr: "4"
|
containerCapStr: "3"
|
||||||
connectTimeout: "20"
|
connectTimeout: "20"
|
||||||
readTimeout: "90"
|
readTimeout: "90"
|
||||||
jenkinsUrl: "http://jenkins.jenkins.svc.cluster.local:8080"
|
jenkinsUrl: "http://jenkins.jenkins.svc.cluster.local:8080"
|
||||||
|
|||||||
@ -32,6 +32,7 @@ resources:
|
|||||||
- disable-k3s-traefik-daemonset.yaml
|
- disable-k3s-traefik-daemonset.yaml
|
||||||
- oneoffs/k3s-traefik-cleanup-job.yaml
|
- oneoffs/k3s-traefik-cleanup-job.yaml
|
||||||
- node-nofile-daemonset.yaml
|
- node-nofile-daemonset.yaml
|
||||||
|
- rpi-resource-reservation-daemonset.yaml
|
||||||
- metis-sentinel-amd64-daemonset.yaml
|
- metis-sentinel-amd64-daemonset.yaml
|
||||||
- metis-sentinel-arm64-daemonset.yaml
|
- metis-sentinel-arm64-daemonset.yaml
|
||||||
- k3s-agent-restart-daemonset.yaml
|
- k3s-agent-restart-daemonset.yaml
|
||||||
@ -84,3 +85,9 @@ configMapGenerator:
|
|||||||
- node_image_sweeper.sh=scripts/node_image_sweeper.sh
|
- node_image_sweeper.sh=scripts/node_image_sweeper.sh
|
||||||
options:
|
options:
|
||||||
disableNameSuffixHash: true
|
disableNameSuffixHash: true
|
||||||
|
- name: rpi-resource-reservation-script
|
||||||
|
namespace: maintenance
|
||||||
|
files:
|
||||||
|
- rpi_resource_reservation.sh=scripts/rpi_resource_reservation.sh
|
||||||
|
options:
|
||||||
|
disableNameSuffixHash: true
|
||||||
|
|||||||
69
services/maintenance/rpi-resource-reservation-daemonset.yaml
Normal file
69
services/maintenance/rpi-resource-reservation-daemonset.yaml
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
# services/maintenance/rpi-resource-reservation-daemonset.yaml
#
# Runs /scripts/rpi_resource_reservation.sh once per matching node. The script
# is mounted from the rpi-resource-reservation-script ConfigMap and the node's
# root filesystem is exposed to it at /host.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: rpi-resource-reservation
  namespace: maintenance
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      app: rpi-resource-reservation
  template:
    metadata:
      labels:
        app: rpi-resource-reservation
    spec:
      serviceAccountName: node-nofile

      # Scheduling: worker-labelled nodes whose `hardware` label is rpi4 or rpi5.
      nodeSelector:
        node-role.kubernetes.io/worker: "true"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: hardware
                    operator: In
                    values:
                      - rpi4
                      - rpi5
      # Tolerate cordoned, not-ready and unreachable nodes.
      tolerations:
        - key: node.kubernetes.io/unschedulable
          operator: Exists
          effect: NoSchedule
        - key: node.kubernetes.io/not-ready
          operator: Exists
          effect: NoExecute
        - key: node.kubernetes.io/unreachable
          operator: Exists
          effect: NoExecute

      volumes:
        # The node's entire root filesystem.
        - name: host-root
          hostPath:
            path: /
        # The script, projected with mode r-xr-xr-x.
        - name: script
          configMap:
            name: rpi-resource-reservation-script
            defaultMode: 0555

      containers:
        - name: reservation
          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
          command:
            - /usr/bin/env
            - bash
          args:
            - /scripts/rpi_resource_reservation.sh
          # Privileged root: the script writes below /host and chroots into it.
          securityContext:
            privileged: true
            runAsUser: 0
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 96Mi
          volumeMounts:
            - name: host-root
              mountPath: /host
            - name: script
              mountPath: /scripts
              readOnly: true
|
||||||
47
services/maintenance/scripts/rpi_resource_reservation.sh
Normal file
47
services/maintenance/scripts/rpi_resource_reservation.sh
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Writes a k3s config drop-in with kubelet resource reservations and eviction
# thresholds onto the host (mounted at /host) and restarts the k3s-agent unit
# when the drop-in changed. Afterwards the script idles so the container keeps
# running.
#
# Behaviour notes:
#   * Hosts without a k3s-agent unit file are left untouched.
#   * The restart is delayed by a random 30-449 seconds. The drop-in is written
#     only AFTER that delay, immediately before the restart: the "already up to
#     date" check below is the only record of pending work, so writing the file
#     first would let a container replaced during the delay skip the restart
#     forever.
#   * TERM/INT are trapped and every long sleep is waited on in the background,
#     so the script exits promptly when asked to stop instead of sitting in a
#     foreground sleep until it is killed.
set -euo pipefail

host_root="/host"
unit="k3s-agent"
unit_file="${host_root}/etc/systemd/system/${unit}.service"
config_dir="${host_root}/etc/rancher/k3s/config.yaml.d"
config_file="${config_dir}/90-atlas-rpi-reservations.yaml"

tmp_file=""
sleep_pid=""

# Remove the scratch file and stop any background sleep on every exit path.
cleanup() {
  if [ -n "${sleep_pid}" ]; then
    kill "${sleep_pid}" 2>/dev/null || true
  fi
  if [ -n "${tmp_file}" ]; then
    rm -f "${tmp_file}"
  fi
}
trap cleanup EXIT
trap 'exit 0' TERM INT

# Sleep for "$1" (any argument accepted by sleep) in a way that a trapped
# signal can interrupt: bash only runs traps once the current foreground
# command returns, whereas the wait builtin returns as soon as a trapped signal
# arrives.
pause() {
  sleep "$1" &
  sleep_pid=$!
  wait "${sleep_pid}" || true
  sleep_pid=""
}

# Keep the container alive until it is told to stop.
idle_forever() {
  pause infinity
  exit 0
}

if [ ! -f "${unit_file}" ]; then
  echo "k3s-agent unit not found; this guardrail only manages worker agents"
  idle_forever
fi

# Render the desired drop-in into a scratch file so it can be compared with
# what is currently on the host.
tmp_file="$(mktemp)"
cat > "${tmp_file}" <<'EOF'
# Managed by Flux via services/maintenance/scripts/rpi_resource_reservation.sh.
# Keep RPi workers below saturation so kubelet and the OS keep enough headroom
# to evict or recover before the board wedges.
kubelet-arg+:
  - "system-reserved=cpu=250m,memory=384Mi,ephemeral-storage=1Gi"
  - "kube-reserved=cpu=150m,memory=256Mi,ephemeral-storage=1Gi"
  - "eviction-hard=memory.available<512Mi,nodefs.available<10%,imagefs.available<10%"
  - "eviction-soft=memory.available<768Mi,nodefs.available<15%,imagefs.available<15%"
  - "eviction-soft-grace-period=memory.available=1m,nodefs.available=2m,imagefs.available=2m"
  - "eviction-max-pod-grace-period=60"
EOF

if [ -f "${config_file}" ] && cmp -s "${tmp_file}" "${config_file}"; then
  echo "${config_file} already up to date"
  idle_forever
fi

# Random delay so nodes that pick up the same change do not all restart their
# agent at the same moment.
delay="$(( (RANDOM % 420) + 30 ))"
echo "${config_file} is out of date; updating it and restarting ${unit} after ${delay}s"
pause "${delay}"

mkdir -p "${config_dir}"
install -m 0644 "${tmp_file}" "${config_file}"
echo "updated ${config_file}; restarting ${unit}"
chroot "${host_root}" /bin/systemctl daemon-reload
chroot "${host_root}" /bin/systemctl restart "${unit}"

idle_forever
|
||||||
Loading…
x
Reference in New Issue
Block a user