titan-iac/services/ai-llm/deployment.yaml

# services/ai-llm/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ollama
  namespace: ai
spec:
  replicas: 1
  revisionHistoryLimit: 2
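  # maxSurge 0 with maxUnavailable 1: the old pod stops before the new one starts, so a rollout never holds two GPU slots at once.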
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
  selector:
    matchLabels:
      app: ollama
  template:
    metadata:
      labels:
        app: ollama
      annotations:
        ai.bstein.dev/model: qwen2.5-coder:7b-instruct-q4_0
        ai.bstein.dev/gpu: GPU pool (titan-20/21/22/24)
    spec:
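      # Schedule only onto the GPU pool nodes (titan-20/21/22/24).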
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                      - titan-20
                      - titan-21
                      - titan-22
                      - titan-24
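      # The nvidia RuntimeClass injects the GPU devices and driver libraries into the pod.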
      runtimeClassName: nvidia
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: ollama-models
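      # Warm-up: pull the model into the shared PVC before the server container starts, so first requests hit a local cache.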
      initContainers:
        - name: warm-model
          image: ollama/ollama:latest
          env:
            - name: OLLAMA_HOST
              value: 0.0.0.0
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: OLLAMA_MODEL
              value: qwen2.5-coder:7b-instruct-q4_0
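          # ollama pull talks to a running server, so start one temporarily, pull the model, then shut it down.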
          command:
            - /bin/sh
            - -c
            - |
              set -e
              ollama serve >/tmp/ollama.log 2>&1 &
              sleep 6
              ollama pull "${OLLAMA_MODEL}"
              pkill ollama || true
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
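          # nvidia.com/gpu.shared is assumed to be the time-sliced GPU resource published by the cluster's NVIDIA device plugin configuration.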
          resources:
            requests:
              cpu: 250m
              memory: 1Gi
              nvidia.com/gpu.shared: 1
            limits:
              nvidia.com/gpu.shared: 1
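      # Main Ollama API server; shares the same model volume and GPU pool as the warm-up container.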
      containers:
        - name: ollama
          image: ollama/ollama:latest
          imagePullPolicy: IfNotPresent
          ports:
            - name: http
              containerPort: 11434
          env:
            - name: OLLAMA_HOST
              value: 0.0.0.0
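            # Keep loaded models resident for 6h after the last request instead of unloading them.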
            - name: OLLAMA_KEEP_ALIVE
              value: 6h
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: "2"
              memory: 8Gi
              nvidia.com/gpu.shared: 1
            limits:
              cpu: "4"
              memory: 12Gi
              nvidia.com/gpu.shared: 1