diff --git a/clusters/atlas/flux-system/applications/kustomization.yaml b/clusters/atlas/flux-system/applications/kustomization.yaml
index 1c22753c..441d50e7 100644
--- a/clusters/atlas/flux-system/applications/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/kustomization.yaml
@@ -26,6 +26,7 @@ resources:
   - mailu/kustomization.yaml
   - jenkins/kustomization.yaml
   - ai-llm/kustomization.yaml
+  - openclaw/kustomization.yaml
   - typhon/kustomization.yaml
   - nextcloud/kustomization.yaml
   - nextcloud-mail-sync/kustomization.yaml
diff --git a/clusters/atlas/flux-system/applications/openclaw/kustomization.yaml b/clusters/atlas/flux-system/applications/openclaw/kustomization.yaml
new file mode 100644
index 00000000..cad29083
--- /dev/null
+++ b/clusters/atlas/flux-system/applications/openclaw/kustomization.yaml
@@ -0,0 +1,34 @@
+# clusters/atlas/flux-system/applications/openclaw/kustomization.yaml
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: openclaw
+  namespace: flux-system
+  annotations:
+    kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
+spec:
+  interval: 10m
+  path: ./services/openclaw
+  targetNamespace: openclaw
+  prune: true
+  sourceRef:
+    kind: GitRepository
+    name: flux-system
+    namespace: flux-system
+  wait: true  # health-checks every applied resource; healthChecks below is ignored while this is set
+  timeout: 30m  # generous: the ollama init container pulls the model before the pod can become ready
+  healthChecks:  # kept as documentation of the two workloads that matter
+    - apiVersion: apps/v1
+      kind: Deployment
+      name: openclaw-ollama
+      namespace: openclaw
+    - apiVersion: apps/v1
+      kind: Deployment
+      name: openclaw
+      namespace: openclaw
+  dependsOn:
+    - name: cert-manager
+    - name: core
+    - name: longhorn
+    - name: traefik
+
diff --git a/services/openclaw/configmap.yaml b/services/openclaw/configmap.yaml
new file mode 100644
index 00000000..f236e1ad
--- /dev/null
+++ b/services/openclaw/configmap.yaml
@@ -0,0 +1,111 @@
+# services/openclaw/configmap.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: openclaw-config
+  namespace: openclaw
+  labels:
+    app: openclaw
+data:  # both keys are copied into /home/node/.openclaw by the gateway pod's init-config container
+  openclaw.json: |
+    {
+      "agents": {
+        "defaults": {
+          "workspace": "/home/node/.openclaw/workspace",
+          "model": {
+            "primary": "ollama-cluster/qwen2.5:7b-instruct-q4_0"
+          },
+          "models": {
+            "ollama-cluster/qwen2.5:7b-instruct-q4_0": {}
+          }
+        },
+        "list": [
+          {
+            "id": "testing-triage",
+            "name": "Titan Testing Triage",
+            "workspace": "/home/node/.openclaw/workspace"
+          }
+        ]
+      },
+      "gateway": {
+        "mode": "local",
+        "auth": {
+          "mode": "token",
+          "token": {
+            "source": "env",
+            "provider": "default",
+            "id": "OPENCLAW_GATEWAY_TOKEN"
+          }
+        },
+        "port": 18789,
+        "bind": "lan",
+        "controlUi": {
+          "enabled": true
+        },
+        "tailscale": {
+          "mode": "off",
+          "resetOnExit": false
+        }
+      },
+      "session": {
+        "dmScope": "per-channel-peer"
+      },
+      "tools": {
+        "profile": "coding"
+      },
+      "models": {
+        "mode": "merge",
+        "providers": {
+          "ollama-cluster": {
+            "baseUrl": "http://openclaw-ollama.openclaw.svc.cluster.local:11434/v1",
+            "api": "openai-completions",
+            "apiKey": "ollama",
+            "models": [
+              {
+                "id": "qwen2.5:7b-instruct-q4_0",
+                "name": "qwen2.5:7b-instruct-q4_0 (Titan local)",
+                "contextWindow": 32768,
+                "maxTokens": 4096,
+                "input": ["text"],
+                "cost": {
+                  "input": 0,
+                  "output": 0,
+                  "cacheRead": 0,
+                  "cacheWrite": 0
+                },
+                "reasoning": false
+              }
+            ]
+          }
+        }
+      }
+    }
+  AGENTS.md: |
+    # Titan Testing Triage
+
+    You are OpenClaw running inside the Titan Kubernetes cluster as a read-only
+    testing and operations triage assistant.
+
+    Your job is to explain failing or suspicious test runs without mutating the
+    cluster. Prefer concise incident summaries with:
+
+    - affected suite, namespace, pod, build, or node
+    - likely root cause
+    - exact evidence gathered
+    - the smallest suggested Flux/IaC change
+    - commands a human can run to verify the conclusion
+
+    Useful read-only commands:
+
+    - `kubectl get nodes -o wide`
+    - `kubectl get pods -A -o wide`
+    - `kubectl get events -A --sort-by=.lastTimestamp`
+    - `kubectl -n NAMESPACE describe pod POD`
+    - `kubectl -n NAMESPACE logs POD --all-containers --tail=200`
+    - `kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io`
+    - `curl -sS "$VICTORIA_METRICS_URL/api/v1/query?query=up"`
+
+    Do not run mutating commands such as `kubectl apply`, `delete`, `scale`,
+    `patch`, `cordon`, `uncordon`, `drain`, or `rollout restart`. Do not read
+    Kubernetes Secret values. Draft repo changes or operator steps instead.
+
diff --git a/services/openclaw/deployment.yaml b/services/openclaw/deployment.yaml
new file mode 100644
index 00000000..0a2e1d80
--- /dev/null
+++ b/services/openclaw/deployment.yaml
@@ -0,0 +1,175 @@
+# services/openclaw/deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: openclaw
+  namespace: openclaw
+  labels:
+    app: openclaw
+spec:
+  replicas: 1
+  revisionHistoryLimit: 2
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: openclaw
+  template:
+    metadata:
+      labels:
+        app: openclaw
+      annotations:
+        ai.bstein.dev/role: testing-triage
+        ai.bstein.dev/placement: Jetson pool (titan-20/21)
+    spec:
+      serviceAccountName: openclaw-triage
+      automountServiceAccountToken: true
+      securityContext:
+        fsGroup: 1000
+        seccompProfile:
+          type: RuntimeDefault
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values:
+                      - titan-20
+                      - titan-21
+      initContainers:
+        - name: init-config
+          image: busybox:1.37
+          imagePullPolicy: IfNotPresent
+          command:
+            - sh
+            - -c
+            - |
+              set -e
+              cp /config/openclaw.json /home/node/.openclaw/openclaw.json
+              mkdir -p /home/node/.openclaw/workspace
+              cp /config/AGENTS.md /home/node/.openclaw/workspace/AGENTS.md
+          securityContext:
+            runAsUser: 1000
+            runAsGroup: 1000
+          volumeMounts:
+            - name: home
+              mountPath: /home/node/.openclaw
+            - name: config
+              mountPath: /config
+          resources:
+            requests:
+              cpu: 25m
+              memory: 32Mi
+            limits:
+              cpu: 100m
+              memory: 64Mi
+        - name: install-kubectl
+          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
+          imagePullPolicy: IfNotPresent
+          command:
+            - /bin/sh
+            - -c
+            - |
+              set -e
+              cp "$(command -v kubectl)" /tools/kubectl
+              chmod 0755 /tools/kubectl
+          volumeMounts:
+            - name: tools
+              mountPath: /tools
+          resources:
+            requests:
+              cpu: 25m
+              memory: 32Mi
+            limits:
+              cpu: 100m
+              memory: 64Mi
+      containers:
+        - name: gateway
+          image: ghcr.io/openclaw/openclaw:slim@sha256:ac2c41d7122194d32258d1ec61b33079dbc498767ecadcd50849782ad5fcb057
+          imagePullPolicy: IfNotPresent
+          command:
+            - node
+            - /app/dist/index.js
+            - gateway
+            - run
+          ports:
+            - name: gateway
+              containerPort: 18789
+              protocol: TCP
+          env:
+            - name: HOME
+              value: /home/node
+            - name: OPENCLAW_CONFIG_DIR
+              value: /home/node/.openclaw
+            - name: NODE_ENV
+              value: production
+            - name: PATH
+              value: /home/node/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+            - name: VICTORIA_METRICS_URL
+              value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
+            - name: JENKINS_BASE_URL
+              value: http://jenkins.jenkins.svc.cluster.local:8080
+            - name: GITEA_BASE_URL
+              value: https://scm.bstein.dev
+            - name: GRAFANA_BASE_URL
+              value: https://metrics.bstein.dev
+            - name: OPENCLAW_GATEWAY_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: openclaw-secrets
+                  key: OPENCLAW_GATEWAY_TOKEN
+          volumeMounts:
+            - name: home
+              mountPath: /home/node/.openclaw
+            - name: tmp
+              mountPath: /tmp
+            - name: tools
+              mountPath: /home/node/.local/bin
+          readinessProbe:
+            exec:
+              command:
+                - node
+                - -e
+                - "require('http').get('http://127.0.0.1:18789/readyz', r => process.exit(r.statusCode < 400 ? 0 : 1)).on('error', () => process.exit(1))"
+            initialDelaySeconds: 20
+            periodSeconds: 10
+            timeoutSeconds: 5
+          livenessProbe:
+            exec:
+              command:
+                - node
+                - -e
+                - "require('http').get('http://127.0.0.1:18789/healthz', r => process.exit(r.statusCode < 400 ? 0 : 1)).on('error', () => process.exit(1))"
+            initialDelaySeconds: 60
+            periodSeconds: 30
+            timeoutSeconds: 10
+          securityContext:
+            runAsNonRoot: true
+            runAsUser: 1000
+            runAsGroup: 1000
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop:
+                - ALL
+          resources:
+            requests:
+              cpu: 250m
+              memory: 512Mi
+            limits:
+              cpu: "1"
+              memory: 2Gi
+      volumes:
+        - name: home
+          persistentVolumeClaim:
+            claimName: openclaw-home
+        - name: config
+          configMap:
+            name: openclaw-config
+        - name: tmp
+          emptyDir: {}
+        - name: tools
+          emptyDir: {}
+
diff --git a/services/openclaw/ingress.yaml b/services/openclaw/ingress.yaml
new file mode 100644
index 00000000..85178ad1
--- /dev/null
+++ b/services/openclaw/ingress.yaml
@@ -0,0 +1,28 @@
+# services/openclaw/ingress.yaml
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: openclaw
+  namespace: openclaw
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt
+    traefik.ingress.kubernetes.io/router.entrypoints: websecure
+    traefik.ingress.kubernetes.io/router.tls: "true"
+spec:
+  ingressClassName: traefik
+  rules:
+    - host: openclaw.bstein.dev
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: openclaw
+                port:
+                  number: 18789
+  tls:
+    - hosts:
+        - openclaw.bstein.dev
+      secretName: openclaw-tls
+
diff --git a/services/openclaw/kustomization.yaml b/services/openclaw/kustomization.yaml
new file mode 100644
index 00000000..70b3aa73
--- /dev/null
+++ b/services/openclaw/kustomization.yaml
@@ -0,0 +1,14 @@
+# services/openclaw/kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+namespace: openclaw
+resources:
+  - namespace.yaml
+  - pvc.yaml
+  - configmap.yaml
+  - rbac.yaml
+  - ollama-deployment.yaml
+  - deployment.yaml
+  - service.yaml
+  - ingress.yaml
+
diff --git a/services/openclaw/namespace.yaml b/services/openclaw/namespace.yaml
new file mode 100644
index 00000000..57fe3787
--- /dev/null
+++ b/services/openclaw/namespace.yaml
@@ -0,0 +1,6 @@
+# services/openclaw/namespace.yaml
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: openclaw
+
diff --git a/services/openclaw/ollama-deployment.yaml b/services/openclaw/ollama-deployment.yaml
new file mode 100644
index 00000000..bc246046
--- /dev/null
+++ b/services/openclaw/ollama-deployment.yaml
@@ -0,0 +1,113 @@
+# services/openclaw/ollama-deployment.yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: openclaw-ollama
+  namespace: openclaw
+  labels:
+    app: openclaw-ollama
+spec:
+  replicas: 1
+  revisionHistoryLimit: 2
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: openclaw-ollama
+  template:
+    metadata:
+      labels:
+        app: openclaw-ollama
+      annotations:
+        ai.bstein.dev/model: qwen2.5:7b-instruct-q4_0
+        ai.bstein.dev/gpu: Jetson pool (titan-20/21)
+    spec:
+      runtimeClassName: nvidia
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values:
+                      - titan-20
+                      - titan-21
+      volumes:
+        - name: models
+          persistentVolumeClaim:
+            claimName: openclaw-ollama-models
+      initContainers:
+        - name: warm-model
+          image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
+          imagePullPolicy: IfNotPresent
+          env:
+            - name: OLLAMA_HOST
+              value: 0.0.0.0
+            - name: OLLAMA_MODELS
+              value: /root/.ollama
+            - name: OLLAMA_MODEL
+              value: qwen2.5:7b-instruct-q4_0
+            - name: NVIDIA_VISIBLE_DEVICES
+              value: all
+            - name: NVIDIA_DRIVER_CAPABILITIES
+              value: compute,utility
+          command:  # start a throwaway server, pull the model onto the models volume, then stop the server
+            - /bin/sh
+            - -c
+            - |
+              set -e
+              ollama serve >/tmp/ollama.log 2>&1 &
+              for i in $(seq 1 60); do ollama list >/dev/null 2>&1 && break; sleep 1; done  # poll up to 60s instead of a fixed sleep
+              ollama pull "${OLLAMA_MODEL}"
+              pkill ollama || true
+          volumeMounts:
+            - name: models
+              mountPath: /root/.ollama
+          resources:
+            requests:
+              cpu: "1"
+              memory: 4Gi
+              nvidia.com/gpu.shared: 1
+            limits:
+              cpu: "4"
+              memory: 10Gi
+              nvidia.com/gpu.shared: 1
+      containers:
+        - name: ollama
+          image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
+          imagePullPolicy: IfNotPresent
+          ports:
+            - name: http
+              containerPort: 11434
+          env:
+            - name: OLLAMA_HOST
+              value: 0.0.0.0
+            - name: OLLAMA_KEEP_ALIVE
+              value: 6h
+            - name: OLLAMA_MODELS
+              value: /root/.ollama
+            - name: NVIDIA_VISIBLE_DEVICES
+              value: all
+            - name: NVIDIA_DRIVER_CAPABILITIES
+              value: compute,utility
+          volumeMounts:
+            - name: models
+              mountPath: /root/.ollama
+          readinessProbe:
+            httpGet:
+              path: /api/tags
+              port: 11434
+            initialDelaySeconds: 15
+            periodSeconds: 10
+            timeoutSeconds: 5
+          resources:
+            requests:
+              cpu: "2"
+              memory: 8Gi
+              nvidia.com/gpu.shared: 1
+            limits:
+              cpu: "6"
+              memory: 12Gi
+              nvidia.com/gpu.shared: 1
+
diff --git a/services/openclaw/pvc.yaml b/services/openclaw/pvc.yaml
new file mode 100644
index 00000000..9f22ce50
--- /dev/null
+++ b/services/openclaw/pvc.yaml
@@ -0,0 +1,27 @@
+# services/openclaw/pvc.yaml
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: openclaw-home
+  namespace: openclaw
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 10Gi
+  storageClassName: asteria
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: openclaw-ollama-models
+  namespace: openclaw
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 16Gi
+  storageClassName: asteria
+
diff --git a/services/openclaw/rbac.yaml b/services/openclaw/rbac.yaml
new file mode 100644
index 00000000..4e136bc0
--- /dev/null
+++ b/services/openclaw/rbac.yaml
@@ -0,0 +1,70 @@
+# services/openclaw/rbac.yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: openclaw-triage
+  namespace: openclaw
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: openclaw-triage-readonly
+rules:
+  - apiGroups: [""]
+    resources:
+      - configmaps
+      - endpoints
+      - events
+      - namespaces
+      - nodes
+      - persistentvolumeclaims
+      - persistentvolumes
+      - pods
+      - pods/log
+      - replicationcontrollers
+      - services
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["apps"]
+    resources:
+      - daemonsets
+      - deployments
+      - replicasets
+      - statefulsets
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["batch"]
+    resources:
+      - cronjobs
+      - jobs
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["networking.k8s.io"]
+    resources:
+      - ingresses
+      - networkpolicies
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["helm.toolkit.fluxcd.io"]
+    resources:
+      - helmreleases
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["kustomize.toolkit.fluxcd.io"]
+    resources:
+      - kustomizations
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["source.toolkit.fluxcd.io"]
+    resources:
+      - gitrepositories
+      - helmrepositories
+    verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: openclaw-triage-readonly
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: openclaw-triage-readonly
+subjects:
+  - kind: ServiceAccount
+    name: openclaw-triage
+    namespace: openclaw
+
diff --git a/services/openclaw/service.yaml b/services/openclaw/service.yaml
new file mode 100644
index 00000000..784245ad
--- /dev/null
+++ b/services/openclaw/service.yaml
@@ -0,0 +1,35 @@
+# services/openclaw/service.yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: openclaw
+  namespace: openclaw
+  labels:
+    app: openclaw
+spec:
+  type: ClusterIP
+  selector:
+    app: openclaw
+  ports:
+    - name: gateway
+      port: 18789
+      targetPort: gateway
+      protocol: TCP
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: openclaw-ollama
+  namespace: openclaw
+  labels:
+    app: openclaw-ollama
+spec:
+  type: ClusterIP
+  selector:
+    app: openclaw-ollama
+  ports:
+    - name: http
+      port: 11434
+      targetPort: http
+      protocol: TCP
+