ops: add resource guardrails for rpi workers
This commit is contained in:
parent
c75e0d1b88
commit
a3e14ce0f2
@ -0,0 +1,21 @@
|
||||
# clusters/atlas/flux-system/platform/descheduler/kustomization.yaml
# Flux Kustomization: reconciles ./infrastructure/descheduler from the
# flux-system GitRepository into kube-system, after `helm` and `core`.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: descheduler
  namespace: flux-system
  annotations:
    # NOTE(review): IfNotPresent means the parent only creates this object
    # when it is missing; later edits here may not be applied - confirm.
    kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
  dependsOn:
    - name: helm
    - name: core
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  path: ./infrastructure/descheduler
  targetNamespace: kube-system
  interval: 30m
  prune: true
  wait: true
|
||||
@ -4,6 +4,8 @@ kind: Kustomization
|
||||
resources:
|
||||
- core/kustomization.yaml
|
||||
- helm/kustomization.yaml
|
||||
- descheduler/kustomization.yaml
|
||||
- resource-guardrails/kustomization.yaml
|
||||
- cert-manager/kustomization.yaml
|
||||
- metallb/kustomization.yaml
|
||||
- traefik/kustomization.yaml
|
||||
|
||||
@ -0,0 +1,19 @@
|
||||
# clusters/atlas/flux-system/platform/resource-guardrails/kustomization.yaml
# Flux Kustomization: reconciles ./infrastructure/resource-guardrails from the
# flux-system GitRepository, after `core`.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: resource-guardrails
  namespace: flux-system
  annotations:
    # NOTE(review): IfNotPresent means the parent only creates this object
    # when it is missing; later edits here may not be applied - confirm.
    kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
  dependsOn:
    - name: core
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  path: ./infrastructure/resource-guardrails
  interval: 10m
  prune: true
  # No targetNamespace: each manifest under the path sets its own namespace.
  wait: true
|
||||
@ -10,5 +10,6 @@ resources:
|
||||
- coredns-custom.yaml
|
||||
- coredns-deployment.yaml
|
||||
- ntp-sync-daemonset.yaml
|
||||
- workload-profiles.yaml
|
||||
- ../sources/cert-manager/letsencrypt.yaml
|
||||
- ../sources/cert-manager/letsencrypt-prod.yaml
|
||||
|
||||
27
infrastructure/core/workload-profiles.yaml
Normal file
27
infrastructure/core/workload-profiles.yaml
Normal file
@ -0,0 +1,27 @@
|
||||
# infrastructure/core/workload-profiles.yaml
# Catalogue of named request/limit profiles, stored as an embedded YAML
# document under the `profiles.yaml` key.
# NOTE(review): the `ci` profile requests 512m CPU while every other profile
# uses a round value - confirm this is intentional and not a typo for 500m.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: kube-system
  name: atlas-workload-profiles
data:
  # The block below is a literal string; keep it byte-stable for consumers.
  profiles.yaml: |
    profiles:
      tiny:
        request: { cpu: 25m, memory: 64Mi }
        limit: { cpu: 200m, memory: 256Mi }
      light:
        request: { cpu: 50m, memory: 128Mi }
        limit: { cpu: 500m, memory: 512Mi }
      standard:
        request: { cpu: 250m, memory: 512Mi }
        limit: { cpu: "1", memory: 1Gi }
      heavy:
        request: { cpu: 500m, memory: 1Gi }
        limit: { cpu: 1500m, memory: 3Gi }
      ci:
        request: { cpu: 512m, memory: 512Mi }
        limit: { cpu: 1500m, memory: 2Gi }
      scavenger:
        request: { cpu: 10m, memory: 32Mi }
        limit: { cpu: 250m, memory: 256Mi }
|
||||
100
infrastructure/descheduler/helmrelease.yaml
Normal file
100
infrastructure/descheduler/helmrelease.yaml
Normal file
@ -0,0 +1,100 @@
|
||||
# infrastructure/descheduler/helmrelease.yaml
# Installs the descheduler chart as a CronJob that runs every 20 minutes and
# evicts at most two pods per node / per namespace per run.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: descheduler
  namespace: kube-system
spec:
  interval: 30m
  install:
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  chart:
    spec:
      chart: descheduler
      version: 0.33.0
      sourceRef:
        kind: HelmRepository
        name: descheduler
        namespace: flux-system
  values:
    kind: CronJob
    schedule: "*/20 * * * *"
    successfulJobsHistoryLimit: 1
    failedJobsHistoryLimit: 3
    resources:
      requests:
        cpu: 50m
        memory: 96Mi
      limits:
        cpu: 200m
        memory: 256Mi
    deschedulerPolicyAPIVersion: descheduler/v1alpha2
    deschedulerPolicy:
      maxNoOfPodsToEvictPerNode: 2
      maxNoOfPodsToEvictPerNamespace: 2
      profiles:
        - name: atlas-rpi-balance
          pluginConfig:
            - name: DefaultEvictor
              args:
                nodeFit: true
                minPodAge: 10m
                # Never evict pods that mount a PVC. Pods with local storage
                # and system-critical pods are already protected by the
                # evictor defaults (evictLocalStoragePods and
                # evictSystemCriticalPods default to false), so they need no
                # extra setting. The previous `podProtections.extraEnabled`
                # form is not a documented DefaultEvictor arg for the pinned
                # 0.33.0 release; revisit when bumping the chart version.
                ignorePvcPods: true
            - name: RemovePodsHavingTooManyRestarts
              args:
                podRestartThreshold: 12
                includingInitContainers: true
            - name: RemovePodsViolatingNodeAffinity
              args:
                nodeAffinityType:
                  - requiredDuringSchedulingIgnoredDuringExecution
            - name: RemovePodsViolatingTopologySpreadConstraint
            - name: RemovePodsViolatingNodeTaints
            - name: LowNodeUtilization
              args:
                # Nodes under all `thresholds` are treated as underutilized;
                # nodes over any `targetThresholds` are eviction sources.
                thresholds:
                  cpu: 45
                  memory: 45
                  pods: 45
                targetThresholds:
                  cpu: 75
                  memory: 75
                  pods: 75
          plugins:
            balance:
              enabled:
                - RemovePodsViolatingTopologySpreadConstraint
                - LowNodeUtilization
            deschedule:
              enabled:
                - RemovePodsHavingTooManyRestarts
                - RemovePodsViolatingNodeTaints
                - RemovePodsViolatingNodeAffinity
    priorityClassName: system-cluster-critical
    nodeSelector:
      node-role.kubernetes.io/worker: "true"
    affinity:
      nodeAffinity:
        preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            preference:
              matchExpressions:
                - key: hardware
                  operator: In
                  values:
                    - rpi5
    tolerations:
      - key: node-role.kubernetes.io/control-plane
        operator: Exists
        effect: NoSchedule
      - key: node-role.kubernetes.io/master
        operator: Exists
        effect: NoSchedule
|
||||
5
infrastructure/descheduler/kustomization.yaml
Normal file
5
infrastructure/descheduler/kustomization.yaml
Normal file
@ -0,0 +1,5 @@
|
||||
# infrastructure/descheduler/kustomization.yaml
# Kustomize entry point for the descheduler; bundles the HelmRelease only.
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1
resources:
  - helmrelease.yaml
|
||||
5
infrastructure/resource-guardrails/kustomization.yaml
Normal file
5
infrastructure/resource-guardrails/kustomization.yaml
Normal file
@ -0,0 +1,5 @@
|
||||
# infrastructure/resource-guardrails/kustomization.yaml
# Kustomize entry point for the per-namespace LimitRange defaults.
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1
resources:
  - limitranges.yaml
|
||||
182
infrastructure/resource-guardrails/limitranges.yaml
Normal file
182
infrastructure/resource-guardrails/limitranges.yaml
Normal file
@ -0,0 +1,182 @@
|
||||
# infrastructure/resource-guardrails/limitranges.yaml
# One LimitRange named atlas-default-compute per namespace. The first item
# defines the shared container defaults behind the `defaultCompute` anchor;
# every other namespace aliases it, except jenkins, which has its own values.
apiVersion: v1
kind: List
items:
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: ai
    # Shared defaults applied to containers that declare no resources.
    spec: &defaultCompute
      limits:
        - type: Container
          default:
            cpu: 500m
            memory: 512Mi
          defaultRequest:
            cpu: 50m
            memory: 96Mi
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: bstein-dev-home
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: cert-manager
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: climate
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: comms
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: crypto
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: finance
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: gitea
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: harbor
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: health
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: jellyfin
    spec: *defaultCompute
  # jenkins gets larger defaults than the shared anchor.
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: jenkins
    spec:
      limits:
        - type: Container
          default:
            cpu: 1500m
            memory: 2Gi
          defaultRequest:
            cpu: 100m
            memory: 256Mi
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: logging
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: mailu-mailserver
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: maintenance
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: metallb-system
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: monitoring
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: nextcloud
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: outline
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: planka
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: postgres
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: quality
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: sso
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: sui-metrics
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: traefik
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: vault
    spec: *defaultCompute
  - apiVersion: v1
    kind: LimitRange
    metadata:
      name: atlas-default-compute
      namespace: vaultwarden
    spec: *defaultCompute
|
||||
9
infrastructure/sources/helm/descheduler.yaml
Normal file
9
infrastructure/sources/helm/descheduler.yaml
Normal file
@ -0,0 +1,9 @@
|
||||
# infrastructure/sources/helm/descheduler.yaml
# Helm chart source for the kubernetes-sigs descheduler; index polled hourly.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  namespace: flux-system
  name: descheduler
spec:
  url: https://kubernetes-sigs.github.io/descheduler/
  interval: 1h
|
||||
@ -3,6 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- ananace.yaml
|
||||
- descheduler.yaml
|
||||
- fluent-bit.yaml
|
||||
- grafana.yaml
|
||||
- hashicorp.yaml
|
||||
|
||||
@ -4,14 +4,18 @@ kind: Deployment
|
||||
metadata:
|
||||
name: monerod
|
||||
namespace: crypto
|
||||
labels: { app: monerod }
|
||||
labels:
|
||||
app: monerod
|
||||
atlas.bstein.dev/workload-profile: heavy
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy: { type: Recreate }
|
||||
selector: { matchLabels: { app: monerod } }
|
||||
template:
|
||||
metadata:
|
||||
labels: { app: monerod }
|
||||
labels:
|
||||
app: monerod
|
||||
atlas.bstein.dev/workload-profile: heavy
|
||||
spec:
|
||||
securityContext:
|
||||
fsGroup: 1000
|
||||
@ -41,6 +45,7 @@ spec:
|
||||
- key: kubernetes.io/hostname
|
||||
operator: NotIn
|
||||
values: ["titan-12","titan-13","titan-15","titan-17","titan-19"]
|
||||
terminationGracePeriodSeconds: 120
|
||||
containers:
|
||||
- name: monerod
|
||||
image: registry.bstein.dev/crypto/monerod:0.18.4.1
|
||||
@ -83,7 +88,13 @@ spec:
|
||||
periodSeconds: 20
|
||||
timeoutSeconds: 20
|
||||
failureThreshold: 36
|
||||
terminationGracePeriodSeconds: 120
|
||||
resources:
|
||||
requests:
|
||||
cpu: 250m
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: 1500m
|
||||
memory: 3Gi
|
||||
lifecycle:
|
||||
preStop:
|
||||
exec:
|
||||
|
||||
@ -4,14 +4,18 @@ kind: Deployment
|
||||
metadata:
|
||||
name: monero-p2pool
|
||||
namespace: crypto
|
||||
labels: { app: monero-p2pool }
|
||||
labels:
|
||||
app: monero-p2pool
|
||||
atlas.bstein.dev/workload-profile: light
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels: { app: monero-p2pool }
|
||||
template:
|
||||
metadata:
|
||||
labels: { app: monero-p2pool }
|
||||
labels:
|
||||
app: monero-p2pool
|
||||
atlas.bstein.dev/workload-profile: light
|
||||
annotations:
|
||||
vault.hashicorp.com/agent-inject: "true"
|
||||
vault.hashicorp.com/role: "crypto"
|
||||
@ -87,6 +91,13 @@ spec:
|
||||
tcpSocket: { port: 3333 }
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
volumeMounts:
|
||||
- { name: p2pool-bin, mountPath: /opt/p2pool }
|
||||
volumes:
|
||||
|
||||
@ -4,14 +4,18 @@ kind: DaemonSet
|
||||
metadata:
|
||||
name: monero-xmrig
|
||||
namespace: crypto
|
||||
labels: { app: monero-xmrig }
|
||||
labels:
|
||||
app: monero-xmrig
|
||||
atlas.bstein.dev/workload-profile: scavenger
|
||||
spec:
|
||||
selector:
|
||||
matchLabels: { app: monero-xmrig }
|
||||
updateStrategy: { type: RollingUpdate }
|
||||
template:
|
||||
metadata:
|
||||
labels: { app: monero-xmrig }
|
||||
labels:
|
||||
app: monero-xmrig
|
||||
atlas.bstein.dev/workload-profile: scavenger
|
||||
spec:
|
||||
priorityClassName: scavenger
|
||||
nodeSelector:
|
||||
@ -47,3 +51,10 @@ spec:
|
||||
--donate-level N \
|
||||
--cpu-priority 1 \
|
||||
--threads "${THR}" ${EXTRA}
|
||||
resources:
|
||||
requests:
|
||||
cpu: 10m
|
||||
memory: 32Mi
|
||||
limits:
|
||||
cpu: 250m
|
||||
memory: 256Mi
|
||||
|
||||
@ -6,6 +6,7 @@ metadata:
|
||||
namespace: gitea
|
||||
labels:
|
||||
app: gitea
|
||||
atlas.bstein.dev/workload-profile: heavy
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
@ -20,6 +21,7 @@ spec:
|
||||
metadata:
|
||||
labels:
|
||||
app: gitea
|
||||
atlas.bstein.dev/workload-profile: heavy
|
||||
annotations:
|
||||
vault.hashicorp.com/agent-inject: "true"
|
||||
vault.hashicorp.com/agent-init-first: "true"
|
||||
@ -197,6 +199,13 @@ spec:
|
||||
value: "true"
|
||||
- name: SSH_PORT
|
||||
value: "2242"
|
||||
resources:
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: 1500m
|
||||
memory: 2Gi
|
||||
volumeMounts:
|
||||
- name: gitea-data
|
||||
mountPath: /data
|
||||
|
||||
@ -77,10 +77,16 @@ spec:
|
||||
internal:
|
||||
nodeSelector:
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
kubernetes.io/hostname: titan-11
|
||||
image:
|
||||
repository: registry.bstein.dev/infra/harbor-redis
|
||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-redis:tag"}
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 96Mi
|
||||
limits:
|
||||
cpu: 250m
|
||||
memory: 256Mi
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
@ -114,10 +120,16 @@ spec:
|
||||
core:
|
||||
nodeSelector:
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
kubernetes.io/hostname: titan-11
|
||||
image:
|
||||
repository: registry.bstein.dev/infra/harbor-core
|
||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-core:tag"}
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
cpu: 750m
|
||||
memory: 1Gi
|
||||
serviceAccountName: harbor-vault-sync
|
||||
automountServiceAccountToken: true
|
||||
existingSecret: harbor-core
|
||||
@ -180,10 +192,16 @@ spec:
|
||||
jobservice:
|
||||
nodeSelector:
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
kubernetes.io/hostname: titan-11
|
||||
image:
|
||||
repository: registry.bstein.dev/infra/harbor-jobservice
|
||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-jobservice:tag"}
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
serviceAccountName: harbor-vault-sync
|
||||
automountServiceAccountToken: true
|
||||
existingSecret: harbor-jobservice
|
||||
@ -227,10 +245,16 @@ spec:
|
||||
portal:
|
||||
nodeSelector:
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
kubernetes.io/hostname: titan-11
|
||||
image:
|
||||
repository: registry.bstein.dev/infra/harbor-portal
|
||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-portal:tag"}
|
||||
resources:
|
||||
requests:
|
||||
cpu: 25m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 128Mi
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
@ -255,7 +279,13 @@ spec:
|
||||
registry:
|
||||
nodeSelector:
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
kubernetes.io/hostname: titan-11
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
cpu: 1
|
||||
memory: 1Gi
|
||||
registry:
|
||||
image:
|
||||
repository: registry.bstein.dev/infra/harbor-registry
|
||||
@ -338,10 +368,16 @@ spec:
|
||||
nginx:
|
||||
nodeSelector:
|
||||
ananke.bstein.dev/harbor-bootstrap: "true"
|
||||
kubernetes.io/hostname: titan-11
|
||||
image:
|
||||
repository: registry.bstein.dev/infra/harbor-nginx
|
||||
tag: v2.14.1-arm64 # {"$imagepolicy": "harbor:harbor-nginx:tag"}
|
||||
resources:
|
||||
requests:
|
||||
cpu: 25m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 128Mi
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
|
||||
@ -6,6 +6,7 @@ metadata:
|
||||
namespace: jellyfin
|
||||
labels:
|
||||
app: jellyfin
|
||||
atlas.bstein.dev/workload-profile: heavy
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy:
|
||||
@ -20,6 +21,7 @@ spec:
|
||||
metadata:
|
||||
labels:
|
||||
app: jellyfin
|
||||
atlas.bstein.dev/workload-profile: heavy
|
||||
annotations:
|
||||
vault.hashicorp.com/agent-inject: "true"
|
||||
vault.hashicorp.com/role: "pegasus"
|
||||
@ -134,6 +136,9 @@ spec:
|
||||
requests:
|
||||
cpu: "500m"
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: "1500m"
|
||||
memory: 3Gi
|
||||
volumeMounts:
|
||||
- name: jellyfin-vault-entrypoint
|
||||
mountPath: /entrypoint.sh
|
||||
|
||||
@ -474,7 +474,7 @@ data:
|
||||
plainText
|
||||
clouds:
|
||||
- kubernetes:
|
||||
containerCapStr: "4"
|
||||
containerCapStr: "3"
|
||||
connectTimeout: "20"
|
||||
readTimeout: "90"
|
||||
jenkinsUrl: "http://jenkins.jenkins.svc.cluster.local:8080"
|
||||
|
||||
@ -32,6 +32,7 @@ resources:
|
||||
- disable-k3s-traefik-daemonset.yaml
|
||||
- oneoffs/k3s-traefik-cleanup-job.yaml
|
||||
- node-nofile-daemonset.yaml
|
||||
- rpi-resource-reservation-daemonset.yaml
|
||||
- metis-sentinel-amd64-daemonset.yaml
|
||||
- metis-sentinel-arm64-daemonset.yaml
|
||||
- k3s-agent-restart-daemonset.yaml
|
||||
@ -84,3 +85,9 @@ configMapGenerator:
|
||||
- node_image_sweeper.sh=scripts/node_image_sweeper.sh
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: rpi-resource-reservation-script
|
||||
namespace: maintenance
|
||||
files:
|
||||
- rpi_resource_reservation.sh=scripts/rpi_resource_reservation.sh
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
|
||||
69
services/maintenance/rpi-resource-reservation-daemonset.yaml
Normal file
69
services/maintenance/rpi-resource-reservation-daemonset.yaml
Normal file
@ -0,0 +1,69 @@
|
||||
# services/maintenance/rpi-resource-reservation-daemonset.yaml
# Runs rpi_resource_reservation.sh on every worker node labelled
# hardware=rpi4|rpi5, with the host filesystem mounted at /host.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: rpi-resource-reservation
  namespace: maintenance
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      app: rpi-resource-reservation
  template:
    metadata:
      labels:
        app: rpi-resource-reservation
    spec:
      serviceAccountName: node-nofile
      # Scheduling: worker role AND Raspberry Pi hardware label are required.
      nodeSelector:
        node-role.kubernetes.io/worker: "true"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: hardware
                    operator: In
                    values:
                      - rpi4
                      - rpi5
      # Stay on the node while it is cordoned, not ready, or unreachable.
      tolerations:
        - key: node.kubernetes.io/unschedulable
          operator: Exists
          effect: NoSchedule
        - key: node.kubernetes.io/not-ready
          operator: Exists
          effect: NoExecute
        - key: node.kubernetes.io/unreachable
          operator: Exists
          effect: NoExecute
      volumes:
        - name: host-root
          hostPath:
            path: /
        - name: script
          configMap:
            name: rpi-resource-reservation-script
            # Integer file mode (octal r-xr-xr-x); must stay unquoted.
            defaultMode: 0555
      containers:
        - name: reservation
          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
          command: ["/usr/bin/env", "bash"]
          args: ["/scripts/rpi_resource_reservation.sh"]
          # Root + privileged: the script writes under /host/etc and chroots
          # into /host to call systemctl.
          securityContext:
            privileged: true
            runAsUser: 0
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 96Mi
          volumeMounts:
            - name: host-root
              mountPath: /host
            - name: script
              mountPath: /scripts
              readOnly: true
|
||||
47
services/maintenance/scripts/rpi_resource_reservation.sh
Normal file
47
services/maintenance/scripts/rpi_resource_reservation.sh
Normal file
@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env bash
# Installs the kubelet reservation/eviction drop-in for k3s-agent on the host
# filesystem (expected at /host) and restarts the agent when the drop-in
# changed. After that the script idles so its container stays Running.
#
# A restart-pending marker is kept on the host so that a restart that was
# scheduled but not yet executed (the container died during the random delay)
# is still carried out on the next start. Without it the drop-in would already
# match on the next run and the new kubelet arguments would never be loaded.
set -euo pipefail

host_root="/host"
unit="k3s-agent"
unit_file="${host_root}/etc/systemd/system/${unit}.service"
config_dir="${host_root}/etc/rancher/k3s/config.yaml.d"
config_file="${config_dir}/90-atlas-rpi-reservations.yaml"
# Assumes /host/run is the host's /run (the chroot'd systemctl calls below are
# expected to need it as well): survives container restarts and is cleared on
# reboot, when k3s-agent starts with the new drop-in anyway.
restart_marker="${host_root}/run/atlas-rpi-reservations.restart-pending"

# Nodes without a k3s-agent unit are not managed; idle instead of exiting so
# the pod does not restart in a loop.
if [ ! -f "${unit_file}" ]; then
  echo "k3s-agent unit not found; this guardrail only manages worker agents"
  sleep infinity
fi

# Render the desired drop-in to a scratch file so it can be compared with the
# copy currently on the host.
tmp_file="$(mktemp)"
cat > "${tmp_file}" <<'EOF'
# Managed by Flux via services/maintenance/scripts/rpi_resource_reservation.sh.
# Keep RPi workers below saturation so kubelet and the OS keep enough headroom
# to evict or recover before the board wedges.
kubelet-arg+:
  - "system-reserved=cpu=250m,memory=384Mi,ephemeral-storage=1Gi"
  - "kube-reserved=cpu=150m,memory=256Mi,ephemeral-storage=1Gi"
  - "eviction-hard=memory.available<512Mi,nodefs.available<10%,imagefs.available<10%"
  - "eviction-soft=memory.available<768Mi,nodefs.available<15%,imagefs.available<15%"
  - "eviction-soft-grace-period=memory.available=1m,nodefs.available=2m,imagefs.available=2m"
  - "eviction-max-pod-grace-period=60"
EOF

changed=0
if [ ! -f "${config_file}" ] || ! cmp -s "${tmp_file}" "${config_file}"; then
  mkdir -p "${config_dir}"
  install -m 0644 "${tmp_file}" "${config_file}"
  # Record the pending restart before the delay starts.
  mkdir -p "$(dirname "${restart_marker}")"
  : > "${restart_marker}"
  changed=1
fi
rm -f "${tmp_file}"

if [ -e "${restart_marker}" ]; then
  # Random 30-449s delay so the agents do not all restart at the same moment.
  delay="$(( (RANDOM % 420) + 30 ))"
  if [ "${changed}" -eq 1 ]; then
    echo "updated ${config_file}; restarting ${unit} after ${delay}s"
  else
    echo "restart of ${unit} still pending from an earlier run; restarting after ${delay}s"
  fi
  sleep "${delay}"
  chroot "${host_root}" /bin/systemctl daemon-reload
  chroot "${host_root}" /bin/systemctl restart "${unit}"
  # Clear the marker only after the restart succeeded; a failure above exits
  # via `set -e` and leaves it in place for the next attempt.
  rm -f "${restart_marker}"
else
  echo "${config_file} already up to date"
fi

sleep infinity
|
||||
Loading…
x
Reference in New Issue
Block a user