# services/ai-llm/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ollama
  namespace: ai
spec:
  replicas: 1
  revisionHistoryLimit: 2
  selector:
    matchLabels:
      app: ollama
  template:
    metadata:
      labels:
        app: ollama
    spec:
      nodeSelector:
        kubernetes.io/hostname: titan-24
      runtimeClassName: nvidia
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: ollama-models
      initContainers:
        - name: warm-model
          image: ollama/ollama:latest
          env:
            - name: OLLAMA_HOST
              value: "0.0.0.0"
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: OLLAMA_MODEL
              value: qwen2.5-coder:7b-instruct-q4_0
          command:
            - /bin/sh
            - -c
            - |
              set -e
              ollama serve >/tmp/ollama.log 2>&1 &
              sleep 6
              ollama pull "${OLLAMA_MODEL}"
              pkill ollama || true
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: 250m
              memory: 1Gi
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1
      containers:
        - name: ollama
          image: ollama/ollama:latest
          imagePullPolicy: IfNotPresent
          ports:
            - name: http
              containerPort: 11434
          env:
            - name: OLLAMA_HOST
              value: "0.0.0.0"
            - name: OLLAMA_KEEP_ALIVE
              value: 6h
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: "2"
              memory: 8Gi
              nvidia.com/gpu: 1
            limits:
              cpu: "4"
              memory: 12Gi
              nvidia.com/gpu: 1
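
# Companion resources (hypothetical sketch, kept commented out so this file
# still applies only the Deployment). The Deployment above assumes a
# PersistentVolumeClaim named "ollama-models" already exists in the "ai"
# namespace, and the named "http" port (11434) is normally reached through a
# Service. The storage size and the Service name below are assumptions, not
# values taken from this repo; adjust them before uncommenting.
#
# apiVersion: v1
# kind: PersistentVolumeClaim
# metadata:
#   name: ollama-models
#   namespace: ai
# spec:
#   accessModes:
#     - ReadWriteOnce
#   resources:
#     requests:
#       storage: 50Gi          # assumed size; the q4_0 7B model needs a few GiB
# ---
# apiVersion: v1
# kind: Service
# metadata:
#   name: ollama               # assumed name
#   namespace: ai
# spec:
#   selector:
#     app: ollama
#   ports:
#     - name: http
#       port: 11434
#       targetPort: http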