# services/ai-llm/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ollama
  namespace: ai
spec:
  replicas: 1
  revisionHistoryLimit: 2
  strategy:
    type: RollingUpdate
    rollingUpdate:
      # Replace the single replica in place; no surge pod contends for the shared GPU.
      maxSurge: 0
      maxUnavailable: 1
  selector:
    matchLabels:
      app: ollama
  template:
    metadata:
      labels:
        app: ollama
      annotations:
        ai.bstein.dev/model: qwen2.5:14b-instruct-q4_0
        ai.bstein.dev/gpu: GPU pool (titan-22/24)
        ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z"
    spec:
      # Pin the pod to the GPU pool nodes and run it under the NVIDIA runtime class.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                      - titan-22
                      - titan-24
      runtimeClassName: nvidia
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: ollama-models
      initContainers:
        # Pre-pull the model into the shared PVC so the main container starts warm.
        - name: warm-model
          image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
          env:
            - name: OLLAMA_HOST
              value: "0.0.0.0"
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: OLLAMA_MODEL
              value: qwen2.5:14b-instruct-q4_0
          command:
            - /bin/sh
            - -c
            - |
              set -e
              ollama serve >/tmp/ollama.log 2>&1 &
              sleep 6
              ollama pull "${OLLAMA_MODEL}"
              pkill ollama || true
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: 500m
              memory: 2Gi
              nvidia.com/gpu.shared: 1
            limits:
              nvidia.com/gpu.shared: 1
      containers:
        - name: ollama
          image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
          imagePullPolicy: IfNotPresent
          ports:
            - name: http
              containerPort: 11434
          env:
            - name: OLLAMA_HOST
              value: "0.0.0.0"
            - name: OLLAMA_KEEP_ALIVE
              value: 6h
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: "4"
              memory: 16Gi
              nvidia.com/gpu.shared: 1
            limits:
              cpu: "8"
              memory: 24Gi
              nvidia.com/gpu.shared: 1
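---
# A minimal sketch of the ollama-models PVC that the Deployment above mounts.
# The Deployment only references the claim by name; the access mode, storage
# class, and size below are assumptions, not taken from the actual cluster
# config. Adjust them for the real storage backend before applying.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ollama-models
  namespace: ai
spec:
  accessModes:
    - ReadWriteOnce          # assumption; a backend shared across titan-22/24 may need ReadWriteMany
  storageClassName: local-path   # assumption: placeholder class name
  resources:
    requests:
      storage: 50Gi          # assumption: headroom for the 14B q4_0 weights plus cache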