ai: add ollama service and wire chat backend

2025-12-20 14:10:34 -03:00 · 2025-12-20 14:10:34 -03:00 · c8adca5a5b
commit c8adca5a5b
parent f68668f987
8 changed files with 156 additions and 0 deletions
--- a/clusters/atlas/flux-system/applications/ai-llm/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/ai-llm/kustomization.yaml
@ -0,0 +1,23 @@
 # clusters/atlas/flux-system/applications/ai-llm/kustomization.yaml
 apiVersion: kustomize.toolkit.fluxcd.io/v1
 kind: Kustomization
 metadata:
  namespace: flux-system
  name: ai-llm
 spec:
  dependsOn:
    - name: core  # reconcile only after the "core" Kustomization
  sourceRef:
    kind: GitRepository
    namespace: flux-system
    name: flux-system
  path: ./services/ai-llm  # kustomize root inside the source repository
  targetNamespace: ai
  interval: 10m
  prune: true  # objects removed from Git are removed from the cluster
  wait: true  # NOTE(review): Flux documents healthChecks as ignored while wait is true; confirm the list below is still wanted
  healthChecks:
    - kind: Deployment
      apiVersion: apps/v1
      namespace: ai
      name: ollama
--- a/clusters/atlas/flux-system/applications/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/kustomization.yaml
@ -22,3 +22,4 @@ resources:
  - jenkins/kustomization.yaml
  - ci-demo/kustomization.yaml
  - ci-demo/image-automation.yaml
  - ai-llm/kustomization.yaml
--- a/services/ai-llm/deployment.yaml
+++ b/services/ai-llm/deployment.yaml
@ -0,0 +1,84 @@
 # services/ai-llm/deployment.yaml
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: ollama
  namespace: ai
 spec:
  replicas: 1  # NOTE(review): the pod requests a GPU on one pinned node; if that node has a single GPU the default RollingUpdate may stall, consider strategy Recreate
  revisionHistoryLimit: 2
  selector:
    matchLabels:
      app: ollama
  template:
    metadata:
      labels:
        app: ollama  # must match spec.selector above and the selector in service.yaml
    spec:
      nodeSelector:
        kubernetes.io/hostname: titan-24  # schedules only onto this node (presumably the GPU host; confirm)
      runtimeClassName: nvidia
      volumes:
        - name: models  # shared by the init container and the server
          persistentVolumeClaim:
            claimName: ollama-models  # defined in pvc.yaml
      initContainers:
        - name: warm-model  # pulls OLLAMA_MODEL into the models volume before the server container starts
          image: ollama/ollama:latest  # NOTE(review): no imagePullPolicy here, so :latest defaults to Always, unlike the main container
          env:
            - name: OLLAMA_HOST
              value: 0.0.0.0
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            - name: OLLAMA_MODELS
              value: /root/.ollama  # same path as the models volume mount below
            - name: OLLAMA_MODEL  # read only by the script below
              value: phi3:mini-4k-instruct-q4_0  # keep in sync with AI_CHAT_MODEL in services/bstein-dev-home/backend-deployment.yaml
          command:  # temporary server -> wait until it answers -> pull the model -> stop the server
            - /bin/sh
            - -c  # the script polls the API for up to 60s instead of sleeping a fixed time, and prints the server log if it never comes up
            - |
              set -e
              ollama serve >/tmp/ollama.log 2>&1 &
              timeout 60 sh -c 'until ollama list >/dev/null 2>&1; do sleep 1; done' || { cat /tmp/ollama.log; exit 1; }
              ollama pull "${OLLAMA_MODEL}"
              pkill ollama || true
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:  # requests only: no limits and no GPU request for the warm-up step
            requests:
              cpu: 250m
              memory: 1Gi
      containers:
        - name: ollama
          image: ollama/ollama:latest  # NOTE(review): floating tag; consider pinning a version for reproducible rollouts
          imagePullPolicy: IfNotPresent
          ports:
            - name: http
              containerPort: 11434  # exposed through service.yaml
          env:
            - name: OLLAMA_HOST
              value: 0.0.0.0  # listen on all interfaces so the Service can reach the pod
            - name: OLLAMA_KEEP_ALIVE
              value: 6h  # presumably how long a loaded model stays resident; confirm against Ollama docs
            - name: OLLAMA_MODELS
              value: /root/.ollama  # same path as the models volume mount below
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: "2"
              memory: 8Gi
              nvidia.com/gpu: 1  # GPU request and limit are identical
            limits:
              cpu: "4"
              memory: 12Gi
              nvidia.com/gpu: 1
--- a/services/ai-llm/kustomization.yaml
+++ b/services/ai-llm/kustomization.yaml
@ -0,0 +1,9 @@
 # services/ai-llm/kustomization.yaml
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:  # manifests that make up the ai-llm service
  - namespace.yaml
  - pvc.yaml
  - deployment.yaml
  - service.yaml
 namespace: ai  # namespace applied to the resources listed above
--- a/services/ai-llm/namespace.yaml
+++ b/services/ai-llm/namespace.yaml
@ -0,0 +1,5 @@
 # services/ai-llm/namespace.yaml
 apiVersion: v1
 kind: Namespace
 metadata:
  name: ai  # namespace used by the other manifests in services/ai-llm
--- a/services/ai-llm/pvc.yaml
+++ b/services/ai-llm/pvc.yaml
@ -0,0 +1,13 @@
 # services/ai-llm/pvc.yaml
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
  namespace: ai
  name: ollama-models  # referenced by claimName in deployment.yaml
 spec:
  storageClassName: astreae  # must name a StorageClass that exists in the cluster
  resources:
    requests:
      storage: 30Gi  # capacity requested for the model store
  accessModes:
    - ReadWriteOnce
--- a/services/ai-llm/service.yaml
+++ b/services/ai-llm/service.yaml
@ -0,0 +1,14 @@
 # services/ai-llm/service.yaml
 apiVersion: v1
 kind: Service
 metadata:
  namespace: ai
  name: ollama  # gives the in-cluster DNS name ollama.ai.svc.cluster.local
 spec:
  ports:
    - port: 11434
      targetPort: 11434  # same number as containerPort in deployment.yaml
      name: http
  selector:
    app: ollama  # matches the pod label set in deployment.yaml
  type: ClusterIP  # reachable from inside the cluster only
--- a/services/bstein-dev-home/backend-deployment.yaml
+++ b/services/bstein-dev-home/backend-deployment.yaml
@ -24,6 +24,13 @@ spec:
        - name: backend
          image: registry.bstein.dev/bstein/bstein-dev-home-backend:latest
          imagePullPolicy: Always
          env:
            - name: AI_CHAT_API
              value: http://ollama.ai.svc.cluster.local:11434
            - name: AI_CHAT_MODEL
              value: phi3:mini-4k-instruct-q4_0
            - name: AI_CHAT_TIMEOUT_SEC
              value: "20"
          ports:
            - name: http
              containerPort: 8080