# services/ai-llm/deployment.yaml
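# Single-replica Ollama server pinned to the GPU node titan-24. A warm-model
# init container pre-pulls qwen2.5-coder:7b-instruct-q4_0 into a shared PVC
# before the server container starts.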
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ollama
  namespace: ai
spec:
  replicas: 1
  revisionHistoryLimit: 2
  selector:
    matchLabels:
      app: ollama
  template:
    metadata:
      labels:
        app: ollama
    spec:
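      # Pin the pod to the GPU node and run it under the NVIDIA runtime class
      # so the containers can access the GPU.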
      nodeSelector:
        kubernetes.io/hostname: titan-24
      runtimeClassName: nvidia
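      # Model blobs live on a PVC shared by the warm-up init container and the server.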
volumes:
- name: models
persistentVolumeClaim:
claimName: ollama-models
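      # Warm-up: start a temporary ollama server, pull ${OLLAMA_MODEL} into the
      # shared PVC, then stop it, so the main container starts with the model
      # already on disk.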
      initContainers:
        - name: warm-model
          image: ollama/ollama:latest
          env:
            - name: OLLAMA_HOST
              value: 0.0.0.0
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: OLLAMA_MODEL
              value: qwen2.5-coder:7b-instruct-q4_0
          command:
            - /bin/sh
            - -c
            - |
              set -e
              ollama serve >/tmp/ollama.log 2>&1 &
              sleep 6
              ollama pull "${OLLAMA_MODEL}"
              pkill ollama || true
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: 250m
              memory: 1Gi
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1
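      # Main Ollama server: serves the API on port 11434 and keeps the loaded
      # model resident for 6h between requests (OLLAMA_KEEP_ALIVE).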
      containers:
        - name: ollama
          image: ollama/ollama:latest
          imagePullPolicy: IfNotPresent
          ports:
            - name: http
              containerPort: 11434
          env:
            - name: OLLAMA_HOST
              value: 0.0.0.0
            - name: OLLAMA_KEEP_ALIVE
              value: 6h
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: "2"
              memory: 8Gi
              nvidia.com/gpu: 1
            limits:
              cpu: "4"
              memory: 12Gi
              nvidia.com/gpu: 1
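
# A minimal sketch of how this manifest is typically applied and checked,
# assuming the kubectl context points at the cluster and the ai namespace and
# ollama-models PVC already exist:
#   kubectl apply -f services/ai-llm/deployment.yaml
#   kubectl -n ai rollout status deploy/ollama
#   kubectl -n ai port-forward deploy/ollama 11434:11434
#   curl http://localhost:11434/api/tags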