# services/ai-llm/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ollama
  namespace: ai
spec:
  replicas: 1
  revisionHistoryLimit: 2
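  # With one replica, maxSurge: 0 / maxUnavailable: 1 removes the old pod
  # before the replacement schedules, so two pods never contend for the
  # shared GPU slot and the models PVC at the same time.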
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
  selector:
    matchLabels:
      app: ollama
  template:
    metadata:
      labels:
        app: ollama
      annotations:
        ai.bstein.dev/model: qwen2.5:7b-instruct-q4_0
        ai.bstein.dev/gpu: GPU pool (titan-22/24)
        # Bumping this timestamp changes the pod template and forces a rollout.
        ai.bstein.dev/restartedAt: "2026-01-26T12:00:00Z"
    spec:
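      # Pin scheduling to the two GPU nodes; the nvidia RuntimeClass selects
      # the NVIDIA container runtime so the pod can access the devices.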
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                      - titan-22
                      - titan-24
      runtimeClassName: nvidia
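      # Model weights live on a PVC so a completed pull survives pod restarts.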
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: ollama-models
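      # The init container pre-pulls the model into the shared PVC so the
      # main container can serve its first request without downloading.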
      initContainers:
        - name: warm-model
          image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
          env:
            - name: OLLAMA_HOST
              value: "0.0.0.0"
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: OLLAMA_MODEL
              value: qwen2.5:7b-instruct-q4_0
          command:
            - /bin/sh
            - -c
            - |
              set -e
              # Start a temporary server so `ollama pull` has an API to talk
              # to, fetch the model, then stop the server before handing off.
              ollama serve >/tmp/ollama.log 2>&1 &
              sleep 6
              ollama pull "${OLLAMA_MODEL}"
              pkill ollama || true
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: 500m
              memory: 2Gi
              nvidia.com/gpu.shared: 1
            limits:
              nvidia.com/gpu.shared: 1
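      # Main server, pinned to the same image digest as the warm-up container
      # so the runtime version matches the cached model blobs.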
      containers:
        - name: ollama
          image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
          imagePullPolicy: IfNotPresent
          ports:
            - name: http
              containerPort: 11434
          env:
            - name: OLLAMA_HOST
              value: "0.0.0.0"
            # Keep the loaded model resident for 6h between requests instead
            # of Ollama's default 5m, avoiding reload latency.
            - name: OLLAMA_KEEP_ALIVE
              value: 6h
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: "4"
              memory: 16Gi
              # Fractional slot of the shared GPU pool (requests must equal
              # limits for extended resources).
              nvidia.com/gpu.shared: 1
            limits:
              cpu: "8"
              memory: 24Gi
              nvidia.com/gpu.shared: 1