openclaw: add testing triage workspace

2026-05-19 19:17:14 -03:00 · 2026-05-19 19:17:14 -03:00 · 1bc58e10c0
commit 1bc58e10c0
parent b7caf4cfec
11 changed files with 614 additions and 0 deletions
--- a/clusters/atlas/flux-system/applications/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/kustomization.yaml
@ -26,6 +26,7 @@ resources:
  - mailu/kustomization.yaml
  - jenkins/kustomization.yaml
  - ai-llm/kustomization.yaml
  - openclaw/kustomization.yaml
  - typhon/kustomization.yaml
  - nextcloud/kustomization.yaml
  - nextcloud-mail-sync/kustomization.yaml
--- a/clusters/atlas/flux-system/applications/openclaw/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/openclaw/kustomization.yaml
@ -0,0 +1,34 @@
 # clusters/atlas/flux-system/applications/openclaw/kustomization.yaml
 # Flux Kustomization: applies ./services/openclaw from the flux-system
 # GitRepository, targeting the openclaw namespace.
 apiVersion: kustomize.toolkit.fluxcd.io/v1
 kind: Kustomization
 metadata:
  name: openclaw
  namespace: flux-system
  annotations:
    # NOTE(review): presumably stops the parent Kustomization from re-applying
    # this object once it exists — confirm that later edits here still land.
    kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
 spec:
  interval: 10m
  path: ./services/openclaw
  targetNamespace: openclaw
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  # NOTE(review): Flux documents that healthChecks is ignored when wait is
  # true — confirm whether the explicit list below is still needed.
  wait: true
  # NOTE(review): presumably sized for the first model pull performed by the
  # openclaw-ollama Deployment — confirm.
  timeout: 30m
  # The two Deployments created from ./services/openclaw.
  healthChecks:
    - apiVersion: apps/v1
      kind: Deployment
      name: openclaw-ollama
      namespace: openclaw
    - apiVersion: apps/v1
      kind: Deployment
      name: openclaw
      namespace: openclaw
  # Other Flux Kustomizations this one depends on.
  dependsOn:
    - name: cert-manager
    - name: core
    - name: longhorn
    - name: traefik
--- a/services/openclaw/configmap.yaml
+++ b/services/openclaw/configmap.yaml
@ -0,0 +1,111 @@
 # services/openclaw/configmap.yaml
 # ConfigMap with two files: the OpenClaw configuration (openclaw.json) and the
 # agent instructions (AGENTS.md). Both values are literal block scalars, so
 # their content is runtime text: no YAML comments may be placed inside them.
 apiVersion: v1
 kind: ConfigMap
 metadata:
  name: openclaw-config
  namespace: openclaw
  labels:
    app: openclaw
 data:
  # JSON configuration. Defines one agent ("testing-triage"), a gateway on port
  # 18789 whose token is read from the OPENCLAW_GATEWAY_TOKEN environment
  # variable, and a single model provider ("ollama-cluster") that points at the
  # openclaw-ollama Service on port 11434.
  openclaw.json: |
    {
      "agents": {
        "defaults": {
          "workspace": "/home/node/.openclaw/workspace",
          "model": {
            "primary": "ollama-cluster/qwen2.5:7b-instruct-q4_0"
          },
          "models": {
            "ollama-cluster/qwen2.5:7b-instruct-q4_0": {}
          }
        },
        "list": [
          {
            "id": "testing-triage",
            "name": "Titan Testing Triage",
            "workspace": "/home/node/.openclaw/workspace"
          }
        ]
      },
      "gateway": {
        "mode": "local",
        "auth": {
          "mode": "token",
          "token": {
            "source": "env",
            "provider": "default",
            "id": "OPENCLAW_GATEWAY_TOKEN"
          }
        },
        "port": 18789,
        "bind": "lan",
        "controlUi": {
          "enabled": true
        },
        "tailscale": {
          "mode": "off",
          "resetOnExit": false
        }
      },
      "session": {
        "dmScope": "per-channel-peer"
      },
      "tools": {
        "profile": "coding"
      },
      "models": {
        "mode": "merge",
        "providers": {
          "ollama-cluster": {
            "baseUrl": "http://openclaw-ollama.openclaw.svc.cluster.local:11434/v1",
            "api": "openai-completions",
            "apiKey": "ollama",
            "models": [
              {
                "id": "qwen2.5:7b-instruct-q4_0",
                "name": "qwen2.5:7b-instruct-q4_0 (Titan local)",
                "contextWindow": 32768,
                "maxTokens": 4096,
                "input": ["text"],
                "cost": {
                  "input": 0,
                  "output": 0,
                  "cacheRead": 0,
                  "cacheWrite": 0
                },
                "reasoning": false
              }
            ]
          }
        }
      }
    }
  # Markdown instructions for the agent: act as a read-only triage assistant,
  # list useful read-only commands, and forbid mutating kubectl commands and
  # reading Secret values.
  AGENTS.md: |
    # Titan Testing Triage
    You are OpenClaw running inside the Titan Kubernetes cluster as a read-only
    testing and operations triage assistant.
    Your job is to explain failing or suspicious test runs without mutating the
    cluster. Prefer concise incident summaries with:
    - affected suite, namespace, pod, build, or node
    - likely root cause
    - exact evidence gathered
    - the smallest suggested Flux/IaC change
    - commands a human can run to verify the conclusion
    Useful read-only commands:
    - `kubectl get nodes -o wide`
    - `kubectl get pods -A -o wide`
    - `kubectl get events -A --sort-by=.lastTimestamp`
    - `kubectl -n <namespace> describe pod <pod>`
    - `kubectl -n <namespace> logs <pod> --all-containers --tail=200`
    - `kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io`
    - `curl -sS "$VICTORIA_METRICS_URL/api/v1/query?query=up"`
    Do not run mutating commands such as `kubectl apply`, `delete`, `scale`,
    `patch`, `cordon`, `uncordon`, `drain`, or `rollout restart`. Do not read
    Kubernetes Secret values. Draft repo changes or operator steps instead.
--- a/services/openclaw/deployment.yaml
+++ b/services/openclaw/deployment.yaml
@ -0,0 +1,175 @@
 # services/openclaw/deployment.yaml
 # OpenClaw gateway Deployment (one replica). Two init containers stage the
 # configuration files and a kubectl binary into shared volumes; the gateway
 # container then runs with a read-only root filesystem and no capabilities.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: openclaw
  namespace: openclaw
  labels:
    app: openclaw
 spec:
  replicas: 1
  revisionHistoryLimit: 2
  # NOTE(review): presumably Recreate because the openclaw-home claim is
  # ReadWriteOnce and cannot be shared by an old and a new pod — confirm.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: openclaw
  template:
    metadata:
      labels:
        app: openclaw
      annotations:
        ai.bstein.dev/role: testing-triage
        ai.bstein.dev/placement: Jetson pool (titan-20/21)
    spec:
      # The token is mounted so the staged kubectl can talk to the API server
      # as the openclaw-triage ServiceAccount.
      serviceAccountName: openclaw-triage
      automountServiceAccountToken: true
      securityContext:
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      # Hard pin to the two nodes named in the placement annotation.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                      - titan-20
                      - titan-21
      initContainers:
        # Copies openclaw.json and AGENTS.md from the ConfigMap into the
        # persistent home volume on every pod start.
        - name: init-config
          image: busybox:1.37
          imagePullPolicy: IfNotPresent
          command:
            - sh
            - -c
            - |
              set -e
              cp /config/openclaw.json /home/node/.openclaw/openclaw.json
              mkdir -p /home/node/.openclaw/workspace
              cp /config/AGENTS.md /home/node/.openclaw/workspace/AGENTS.md
          # Hardened to match the gateway container: the script only copies
          # files into a mounted volume and needs no extra privileges.
          securityContext:
            runAsNonRoot: true
            runAsUser: 1000
            runAsGroup: 1000
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
          volumeMounts:
            - name: home
              mountPath: /home/node/.openclaw
            - name: config
              mountPath: /config
          resources:
            requests:
              cpu: 25m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
        # Copies the kubectl binary out of its image into the shared "tools"
        # volume, which the gateway mounts at /home/node/.local/bin.
        - name: install-kubectl
          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
          imagePullPolicy: IfNotPresent
          command:
            - /bin/sh
            - -c
            - |
              set -e
              cp "$(command -v kubectl)" /tools/kubectl
              chmod 0755 /tools/kubectl
          # Copying and chmod-ing its own file needs no capabilities; the
          # image's default user is left unchanged.
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
          volumeMounts:
            - name: tools
              mountPath: /tools
          resources:
            requests:
              cpu: 25m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
      containers:
        - name: gateway
          image: ghcr.io/openclaw/openclaw:slim@sha256:ac2c41d7122194d32258d1ec61b33079dbc498767ecadcd50849782ad5fcb057
          imagePullPolicy: IfNotPresent
          command:
            - node
            - /app/dist/index.js
            - gateway
            - run
          ports:
            - name: gateway
              containerPort: 18789
              protocol: TCP
          env:
            - name: HOME
              value: /home/node
            - name: OPENCLAW_CONFIG_DIR
              value: /home/node/.openclaw
            - name: NODE_ENV
              value: production
            # /home/node/.local/bin comes first so the staged kubectl is found.
            - name: PATH
              value: /home/node/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
            - name: VICTORIA_METRICS_URL
              value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
            - name: JENKINS_BASE_URL
              value: http://jenkins.jenkins.svc.cluster.local:8080
            - name: GITEA_BASE_URL
              value: https://scm.bstein.dev
            - name: GRAFANA_BASE_URL
              value: https://metrics.bstein.dev
            # Gateway auth token; openclaw.json reads it from this variable.
            - name: OPENCLAW_GATEWAY_TOKEN
              valueFrom:
                secretKeyRef:
                  name: openclaw-secrets
                  key: OPENCLAW_GATEWAY_TOKEN
          volumeMounts:
            - name: home
              mountPath: /home/node/.openclaw
            # Writable scratch space, needed because the root filesystem is
            # read-only.
            - name: tmp
              mountPath: /tmp
            - name: tools
              mountPath: /home/node/.local/bin
          # Probes run node inside the container and issue an HTTP GET against
          # the gateway on localhost; any status below 400 counts as success.
          readinessProbe:
            exec:
              command:
                - node
                - -e
                - "require('http').get('http://127.0.0.1:18789/readyz', r => process.exit(r.statusCode < 400 ? 0 : 1)).on('error', () => process.exit(1))"
            initialDelaySeconds: 20
            periodSeconds: 10
            timeoutSeconds: 5
          livenessProbe:
            exec:
              command:
                - node
                - -e
                - "require('http').get('http://127.0.0.1:18789/healthz', r => process.exit(r.statusCode < 400 ? 0 : 1)).on('error', () => process.exit(1))"
            initialDelaySeconds: 60
            periodSeconds: 30
            timeoutSeconds: 10
          securityContext:
            runAsNonRoot: true
            runAsUser: 1000
            runAsGroup: 1000
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              cpu: 250m
              memory: 512Mi
            limits:
              cpu: "1"
              memory: 2Gi
      volumes:
        # Persistent OpenClaw state (config file and workspace).
        - name: home
          persistentVolumeClaim:
            claimName: openclaw-home
        - name: config
          configMap:
            name: openclaw-config
        - name: tmp
          emptyDir: {}
        # Filled by install-kubectl on every pod start.
        - name: tools
          emptyDir: {}
--- a/services/openclaw/ingress.yaml
+++ b/services/openclaw/ingress.yaml
@ -0,0 +1,28 @@
 # services/openclaw/ingress.yaml
 # Ingress that routes every path of openclaw.bstein.dev to the openclaw
 # Service on port 18789 through the traefik ingress class.
 apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
  name: openclaw
  namespace: openclaw
  annotations:
    # Certificate comes from the cert-manager ClusterIssuer named letsencrypt.
    cert-manager.io/cluster-issuer: letsencrypt
    # Quoted so the annotation value stays a string rather than a boolean.
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: "true"
 spec:
  ingressClassName: traefik
  rules:
    - host: openclaw.bstein.dev
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: openclaw
                port:
                  number: 18789
  # TLS material for the host is stored in the openclaw-tls Secret.
  tls:
    - hosts:
        - openclaw.bstein.dev
      secretName: openclaw-tls
--- a/services/openclaw/kustomization.yaml
+++ b/services/openclaw/kustomization.yaml
@ -0,0 +1,14 @@
 # services/openclaw/kustomization.yaml
 # Kustomize entry point listing the manifests that make up the openclaw
 # service; all of them are placed in the openclaw namespace.
 # NOTE(review): deployment.yaml references a Secret named openclaw-secrets
 # that is not listed here — presumably provisioned elsewhere; confirm.
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 namespace: openclaw
 resources:
  - namespace.yaml
  - pvc.yaml
  - configmap.yaml
  - rbac.yaml
  - ollama-deployment.yaml
  - deployment.yaml
  - service.yaml
  - ingress.yaml
--- a/services/openclaw/namespace.yaml
+++ b/services/openclaw/namespace.yaml
@ -0,0 +1,6 @@
 # services/openclaw/namespace.yaml
 # Namespace that holds every namespaced openclaw resource.
 apiVersion: v1
 kind: Namespace
 metadata:
  name: openclaw
--- a/services/openclaw/ollama-deployment.yaml
+++ b/services/openclaw/ollama-deployment.yaml
@ -0,0 +1,113 @@
 # services/openclaw/ollama-deployment.yaml
 # Ollama model server for OpenClaw (one replica). An init container pulls the
 # model onto the persistent models volume, then the main container serves it
 # on port 11434.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: openclaw-ollama
  namespace: openclaw
  labels:
    app: openclaw-ollama
 spec:
  replicas: 1
  revisionHistoryLimit: 2
  # NOTE(review): presumably Recreate because the models claim is
  # ReadWriteOnce and the GPU share cannot be held twice — confirm.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: openclaw-ollama
  template:
    metadata:
      labels:
        app: openclaw-ollama
      annotations:
        ai.bstein.dev/model: qwen2.5:7b-instruct-q4_0
        ai.bstein.dev/gpu: Jetson pool (titan-20/21)
    spec:
      runtimeClassName: nvidia
      # Hard pin to the two nodes named in the gpu annotation.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                      - titan-20
                      - titan-21
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: openclaw-ollama-models
      initContainers:
        # Starts a temporary server, pulls OLLAMA_MODEL into the models volume
        # and stops the server again, so the main container finds the model
        # already on disk.
        - name: warm-model
          image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
          imagePullPolicy: IfNotPresent
          env:
            - name: OLLAMA_HOST
              value: 0.0.0.0
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: OLLAMA_MODEL
              value: qwen2.5:7b-instruct-q4_0
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
          command:
            - /bin/sh
            - -c
            - |
              set -e
              ollama serve >/tmp/ollama.log 2>&1 &
              # Poll until the background server answers instead of relying on
              # a fixed sleep, which races with slow server start-up.
              tries=0
              until ollama list >/dev/null 2>&1; do
                tries=$((tries + 1))
                if [ "$tries" -ge 60 ]; then
                  echo "ollama serve did not become ready after 60 attempts" >&2
                  cat /tmp/ollama.log >&2 || true
                  exit 1
                fi
                sleep 1
              done
              ollama pull "${OLLAMA_MODEL}"
              pkill ollama || true
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: "1"
              memory: 4Gi
              nvidia.com/gpu.shared: 1
            limits:
              cpu: "4"
              memory: 10Gi
              nvidia.com/gpu.shared: 1
      containers:
        - name: ollama
          image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
          imagePullPolicy: IfNotPresent
          ports:
            - name: http
              containerPort: 11434
          env:
            - name: OLLAMA_HOST
              value: 0.0.0.0
            - name: OLLAMA_KEEP_ALIVE
              value: 6h
            # Same model directory as the warm-model init container.
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          # Ready once the HTTP API answers on /api/tags.
          readinessProbe:
            httpGet:
              path: /api/tags
              port: 11434
            initialDelaySeconds: 15
            periodSeconds: 10
            timeoutSeconds: 5
          resources:
            requests:
              cpu: "2"
              memory: 8Gi
              nvidia.com/gpu.shared: 1
            limits:
              cpu: "6"
              memory: 12Gi
              nvidia.com/gpu.shared: 1
--- a/services/openclaw/pvc.yaml
+++ b/services/openclaw/pvc.yaml
@ -0,0 +1,27 @@
 # services/openclaw/pvc.yaml
 # Claim backing the "home" volume that the openclaw Deployment mounts at
 # /home/node/.openclaw.
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
  name: openclaw-home
  namespace: openclaw
 spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
  storageClassName: asteria
 ---
 # Claim backing the "models" volume that the openclaw-ollama Deployment
 # mounts at /root/.ollama.
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
  name: openclaw-ollama-models
  namespace: openclaw
 spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 16Gi
  storageClassName: asteria
--- a/services/openclaw/rbac.yaml
+++ b/services/openclaw/rbac.yaml
@ -0,0 +1,70 @@
 # services/openclaw/rbac.yaml
 # Identity used by the openclaw Deployment (its serviceAccountName).
 apiVersion: v1
 kind: ServiceAccount
 metadata:
  name: openclaw-triage
  namespace: openclaw
 ---
 # Cluster-wide read-only role: every rule grants only get, list and watch.
 # Secrets are not listed in any rule.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
  name: openclaw-triage-readonly
 rules:
  # Core API group.
  - apiGroups:
      - ""
    resources:
      - configmaps
      - endpoints
      - events
      - namespaces
      - nodes
      - persistentvolumeclaims
      - persistentvolumes
      - pods
      - pods/log
      - replicationcontrollers
      - services
    verbs:
      - get
      - list
      - watch
  # Workload controllers.
  - apiGroups:
      - apps
    resources:
      - daemonsets
      - deployments
      - replicasets
      - statefulsets
    verbs:
      - get
      - list
      - watch
  # Batch workloads.
  - apiGroups:
      - batch
    resources:
      - cronjobs
      - jobs
    verbs:
      - get
      - list
      - watch
  # Networking objects.
  - apiGroups:
      - networking.k8s.io
    resources:
      - ingresses
      - networkpolicies
    verbs:
      - get
      - list
      - watch
  # Flux: Helm releases.
  - apiGroups:
      - helm.toolkit.fluxcd.io
    resources:
      - helmreleases
    verbs:
      - get
      - list
      - watch
  # Flux: Kustomizations.
  - apiGroups:
      - kustomize.toolkit.fluxcd.io
    resources:
      - kustomizations
    verbs:
      - get
      - list
      - watch
  # Flux: sources.
  - apiGroups:
      - source.toolkit.fluxcd.io
    resources:
      - gitrepositories
      - helmrepositories
    verbs:
      - get
      - list
      - watch
 ---
 # Grants the openclaw-triage-readonly ClusterRole to the openclaw-triage
 # ServiceAccount in the openclaw namespace, cluster-wide.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
  name: openclaw-triage-readonly
 roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: openclaw-triage-readonly
 subjects:
  - kind: ServiceAccount
    name: openclaw-triage
    namespace: openclaw
--- a/services/openclaw/service.yaml
+++ b/services/openclaw/service.yaml
@ -0,0 +1,35 @@
 # services/openclaw/service.yaml
 # ClusterIP Service for the gateway; this is the backend the openclaw Ingress
 # points at (port 18789).
 apiVersion: v1
 kind: Service
 metadata:
  name: openclaw
  namespace: openclaw
  labels:
    app: openclaw
 spec:
  type: ClusterIP
  selector:
    app: openclaw
  ports:
    - name: gateway
      port: 18789
      # Resolved by name against the container port called "gateway".
      targetPort: gateway
      protocol: TCP
 ---
 # ClusterIP Service for the Ollama API; the provider baseUrl in openclaw.json
 # targets this Service on port 11434.
 apiVersion: v1
 kind: Service
 metadata:
  name: openclaw-ollama
  namespace: openclaw
  labels:
    app: openclaw-ollama
 spec:
  type: ClusterIP
  selector:
    app: openclaw-ollama
  ports:
    - name: http
      port: 11434
      # Resolved by name against the container port called "http".
      targetPort: http
      protocol: TCP