# services/ai-llm/deployment.yaml
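# Single-replica Ollama server pinned to the GPU node titan-24. A warm-model
# init container pre-pulls qwen2.5-coder:7b-instruct-q4_0 into a shared PVC
# before the server container starts.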
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ollama
  namespace: ai
spec:
  replicas: 1
  revisionHistoryLimit: 2
  selector:
    matchLabels:
      app: ollama
  template:
    metadata:
      labels:
        app: ollama
    spec:
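      # Pin the pod to the GPU node and run it under the NVIDIA runtime class
      # so the containers can access the GPU.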
      nodeSelector:
        kubernetes.io/hostname: titan-24
      runtimeClassName: nvidia
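      # Model blobs live on a PVC shared by the warm-up init container and the server.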
volumes:
- name: models
persistentVolumeClaim:
claimName: ollama-models
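      # Warm-up: start a temporary ollama server, pull ${OLLAMA_MODEL} into the
      # shared PVC, then stop it, so the main container starts with the model
      # already on disk.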
      initContainers:
        - name: warm-model
          image: ollama/ollama:latest
          env:
            - name: OLLAMA_HOST
              value: 0.0.0.0
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: OLLAMA_MODEL
              value: qwen2.5-coder:7b-instruct-q4_0
          command:
            - /bin/sh
            - -c
            - |
              set -e
              ollama serve >/tmp/ollama.log 2>&1 &
              sleep 6
              ollama pull "${OLLAMA_MODEL}"
              pkill ollama || true
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: 250m
              memory: 1Gi
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1
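      # Main Ollama server: serves the API on port 11434 and keeps the loaded
      # model resident for 6h between requests (OLLAMA_KEEP_ALIVE).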
      containers:
        - name: ollama
          image: ollama/ollama:latest
          imagePullPolicy: IfNotPresent
          ports:
            - name: http
              containerPort: 11434
          env:
            - name: OLLAMA_HOST
              value: 0.0.0.0
            - name: OLLAMA_KEEP_ALIVE
              value: 6h
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: "2"
              memory: 8Gi
              nvidia.com/gpu: 1
            limits:
              cpu: "4"
              memory: 12Gi
              nvidia.com/gpu: 1
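
# A minimal sketch of how this manifest is typically applied and checked,
# assuming the kubectl context points at the cluster and the ai namespace and
# ollama-models PVC already exist:
#   kubectl apply -f services/ai-llm/deployment.yaml
#   kubectl -n ai rollout status deploy/ollama
#   kubectl -n ai port-forward deploy/ollama 11434:11434
#   curl http://localhost:11434/api/tags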