openclaw: add testing triage workspace

This commit is contained in:
jenkins 2026-05-19 19:17:14 -03:00
parent b7caf4cfec
commit 1bc58e10c0
11 changed files with 614 additions and 0 deletions

View File

@ -26,6 +26,7 @@ resources:
- mailu/kustomization.yaml - mailu/kustomization.yaml
- jenkins/kustomization.yaml - jenkins/kustomization.yaml
- ai-llm/kustomization.yaml - ai-llm/kustomization.yaml
- openclaw/kustomization.yaml
- typhon/kustomization.yaml - typhon/kustomization.yaml
- nextcloud/kustomization.yaml - nextcloud/kustomization.yaml
- nextcloud-mail-sync/kustomization.yaml - nextcloud-mail-sync/kustomization.yaml

View File

@ -0,0 +1,34 @@
# clusters/atlas/flux-system/applications/openclaw/kustomization.yaml
# Flux Kustomization that reconciles ./services/openclaw from the flux-system
# GitRepository into the openclaw namespace.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: openclaw
  namespace: flux-system
  annotations:
    # NOTE(review): if this object is itself applied by a parent Flux
    # Kustomization, IfNotPresent means later edits to this file are not
    # re-applied to the cluster - confirm that is intended.
    kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
  interval: 10m
  path: ./services/openclaw
  targetNamespace: openclaw
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
    namespace: flux-system
  # wait: true makes Flux health-check every reconciled resource (including
  # the openclaw and openclaw-ollama Deployments) and ignore spec.healthChecks,
  # so no explicit healthChecks list is declared here.
  wait: true
  # Upper bound for apply + health checks.
  timeout: 30m
  # Reconcile only after these Kustomizations in flux-system are ready.
  dependsOn:
    - name: cert-manager
    - name: core
    - name: longhorn
    - name: traefik

View File

@ -0,0 +1,111 @@
# services/openclaw/configmap.yaml
# Gateway configuration (openclaw.json) and the agent instructions (AGENTS.md)
# that the openclaw Deployment's init-config container copies into the
# persistent home volume on every pod start.
apiVersion: v1
kind: ConfigMap
metadata:
  name: openclaw-config
  namespace: openclaw
  labels:
    app: openclaw
data:
  # Strict JSON: no comments allowed inside this block.
  openclaw.json: |
    {
      "agents": {
        "defaults": {
          "workspace": "/home/node/.openclaw/workspace",
          "model": {
            "primary": "ollama-cluster/qwen2.5:7b-instruct-q4_0"
          },
          "models": {
            "ollama-cluster/qwen2.5:7b-instruct-q4_0": {}
          }
        },
        "list": [
          {
            "id": "testing-triage",
            "name": "Titan Testing Triage",
            "workspace": "/home/node/.openclaw/workspace"
          }
        ]
      },
      "gateway": {
        "mode": "local",
        "auth": {
          "mode": "token",
          "token": {
            "source": "env",
            "provider": "default",
            "id": "OPENCLAW_GATEWAY_TOKEN"
          }
        },
        "port": 18789,
        "bind": "lan",
        "controlUi": {
          "enabled": true
        },
        "tailscale": {
          "mode": "off",
          "resetOnExit": false
        }
      },
      "session": {
        "dmScope": "per-channel-peer"
      },
      "tools": {
        "profile": "coding"
      },
      "models": {
        "mode": "merge",
        "providers": {
          "ollama-cluster": {
            "baseUrl": "http://openclaw-ollama.openclaw.svc.cluster.local:11434/v1",
            "api": "openai-completions",
            "apiKey": "ollama",
            "models": [
              {
                "id": "qwen2.5:7b-instruct-q4_0",
                "name": "qwen2.5:7b-instruct-q4_0 (Titan local)",
                "contextWindow": 32768,
                "maxTokens": 4096,
                "input": ["text"],
                "cost": {
                  "input": 0,
                  "output": 0,
                  "cacheRead": 0,
                  "cacheWrite": 0
                },
                "reasoning": false
              }
            ]
          }
        }
      }
    }
  # Blank lines separate headings, paragraphs, and lists so that paragraph text
  # following a list is not parsed as a continuation of the last bullet.
  AGENTS.md: |
    # Titan Testing Triage

    You are OpenClaw running inside the Titan Kubernetes cluster as a read-only
    testing and operations triage assistant.

    Your job is to explain failing or suspicious test runs without mutating the
    cluster. Prefer concise incident summaries with:

    - affected suite, namespace, pod, build, or node
    - likely root cause
    - exact evidence gathered
    - the smallest suggested Flux/IaC change
    - commands a human can run to verify the conclusion

    Useful read-only commands:

    - `kubectl get nodes -o wide`
    - `kubectl get pods -A -o wide`
    - `kubectl get events -A --sort-by=.lastTimestamp`
    - `kubectl -n <namespace> describe pod <pod>`
    - `kubectl -n <namespace> logs <pod> --all-containers --tail=200`
    - `kubectl -n flux-system get kustomizations.kustomize.toolkit.fluxcd.io`
    - `curl -sS "$VICTORIA_METRICS_URL/api/v1/query?query=up"`

    Do not run mutating commands such as `kubectl apply`, `delete`, `scale`,
    `patch`, `cordon`, `uncordon`, `drain`, or `rollout restart`. Do not read
    Kubernetes Secret values. Draft repo changes or operator steps instead.

View File

@ -0,0 +1,175 @@
# services/openclaw/deployment.yaml
# OpenClaw gateway Deployment: one replica pinned to titan-20/titan-21.
# Init containers seed the config into the home volume and copy kubectl into a
# shared tools volume; the gateway container then runs with a read-only root
# filesystem and the openclaw-triage ServiceAccount token mounted.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: openclaw
  namespace: openclaw
  labels:
    app: openclaw
spec:
  replicas: 1
  revisionHistoryLimit: 2
  # Recreate: the home volume is a ReadWriteOnce PVC, so the old pod must be
  # gone before the new one starts.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: openclaw
  template:
    metadata:
      labels:
        app: openclaw
      annotations:
        ai.bstein.dev/role: testing-triage
        ai.bstein.dev/placement: Jetson pool (titan-20/21)
    spec:
      serviceAccountName: openclaw-triage
      # The token is needed by the kubectl binary exposed to the gateway.
      automountServiceAccountToken: true
      securityContext:
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                      - titan-20
                      - titan-21
      initContainers:
        # Copies the ConfigMap files into the persistent home volume; this
        # overwrites openclaw.json and AGENTS.md on every pod start.
        - name: init-config
          # NOTE(review): tag-only reference; the other images in this pod are
          # pinned by digest - consider pinning this one too.
          image: busybox:1.37
          imagePullPolicy: IfNotPresent
          command:
            - sh
            - -c
            - |
              set -e
              cp /config/openclaw.json /home/node/.openclaw/openclaw.json
              mkdir -p /home/node/.openclaw/workspace
              cp /config/AGENTS.md /home/node/.openclaw/workspace/AGENTS.md
          # Same hardening as the gateway container: this step only writes to
          # the mounted home volume and needs no capabilities.
          securityContext:
            runAsNonRoot: true
            runAsUser: 1000
            runAsGroup: 1000
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          volumeMounts:
            - name: home
              mountPath: /home/node/.openclaw
            - name: config
              mountPath: /config
          resources:
            requests:
              cpu: 25m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
        # Copies the kubectl binary from its image into the shared tools
        # volume, which the gateway mounts at /home/node/.local/bin (on PATH).
        - name: install-kubectl
          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
          imagePullPolicy: IfNotPresent
          command:
            - /bin/sh
            - -c
            - |
              set -e
              cp "$(command -v kubectl)" /tools/kubectl
              chmod 0755 /tools/kubectl
          # Run as the same non-root UID/GID as the gateway so the copied
          # binary is owned by the user that executes it; the step only writes
          # to the mounted tools volume.
          securityContext:
            runAsNonRoot: true
            runAsUser: 1000
            runAsGroup: 1000
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          volumeMounts:
            - name: tools
              mountPath: /tools
          resources:
            requests:
              cpu: 25m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
      containers:
        - name: gateway
          image: ghcr.io/openclaw/openclaw:slim@sha256:ac2c41d7122194d32258d1ec61b33079dbc498767ecadcd50849782ad5fcb057
          imagePullPolicy: IfNotPresent
          command:
            - node
            - /app/dist/index.js
            - gateway
            - run
          ports:
            - name: gateway
              containerPort: 18789
              protocol: TCP
          env:
            - name: HOME
              value: /home/node
            - name: OPENCLAW_CONFIG_DIR
              value: /home/node/.openclaw
            - name: NODE_ENV
              value: production
            # /home/node/.local/bin comes first so the copied kubectl is found.
            - name: PATH
              value: /home/node/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
            - name: VICTORIA_METRICS_URL
              value: http://victoria-metrics-single-server.monitoring.svc.cluster.local:8428
            - name: JENKINS_BASE_URL
              value: http://jenkins.jenkins.svc.cluster.local:8080
            - name: GITEA_BASE_URL
              value: https://scm.bstein.dev
            - name: GRAFANA_BASE_URL
              value: https://metrics.bstein.dev
            # NOTE(review): the openclaw-secrets Secret is not among the
            # resources listed in services/openclaw/kustomization.yaml; it must
            # already exist in the namespace or the container cannot start.
            - name: OPENCLAW_GATEWAY_TOKEN
              valueFrom:
                secretKeyRef:
                  name: openclaw-secrets
                  key: OPENCLAW_GATEWAY_TOKEN
          volumeMounts:
            - name: home
              mountPath: /home/node/.openclaw
            # Writable scratch space, since the root filesystem is read-only.
            - name: tmp
              mountPath: /tmp
            - name: tools
              mountPath: /home/node/.local/bin
          # Both probes issue an HTTP GET from inside the container with node
          # and succeed when the status code is below 400.
          readinessProbe:
            exec:
              command:
                - node
                - -e
                - "require('http').get('http://127.0.0.1:18789/readyz', r => process.exit(r.statusCode < 400 ? 0 : 1)).on('error', () => process.exit(1))"
            initialDelaySeconds: 20
            periodSeconds: 10
            timeoutSeconds: 5
          livenessProbe:
            exec:
              command:
                - node
                - -e
                - "require('http').get('http://127.0.0.1:18789/healthz', r => process.exit(r.statusCode < 400 ? 0 : 1)).on('error', () => process.exit(1))"
            initialDelaySeconds: 60
            periodSeconds: 30
            timeoutSeconds: 10
          securityContext:
            runAsNonRoot: true
            runAsUser: 1000
            runAsGroup: 1000
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              cpu: 250m
              memory: 512Mi
            limits:
              cpu: "1"
              memory: 2Gi
      volumes:
        - name: home
          persistentVolumeClaim:
            claimName: openclaw-home
        - name: config
          configMap:
            name: openclaw-config
        - name: tmp
          emptyDir: {}
        - name: tools
          emptyDir: {}

View File

@ -0,0 +1,28 @@
# services/openclaw/ingress.yaml
# Routes https://openclaw.bstein.dev through Traefik's websecure entrypoint to
# the openclaw Service; cert-manager provisions the openclaw-tls certificate
# from the letsencrypt cluster issuer.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: openclaw
  namespace: openclaw
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: "true"
spec:
  ingressClassName: traefik
  tls:
    - secretName: openclaw-tls
      hosts:
        - openclaw.bstein.dev
  rules:
    - host: openclaw.bstein.dev
      http:
        paths:
          # Everything under / goes to the gateway port.
          - pathType: Prefix
            path: /
            backend:
              service:
                name: openclaw
                port:
                  number: 18789

View File

@ -0,0 +1,14 @@
# services/openclaw/kustomization.yaml
# Kustomize entry point for the openclaw service; every listed manifest is
# rendered into the openclaw namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: openclaw
resources:
  # Namespace, storage, configuration, and access control
  - namespace.yaml
  - pvc.yaml
  - configmap.yaml
  - rbac.yaml
  # Workloads
  - ollama-deployment.yaml
  - deployment.yaml
  # Networking
  - service.yaml
  - ingress.yaml

View File

@ -0,0 +1,6 @@
# services/openclaw/namespace.yaml
# Namespace that holds all namespaced openclaw resources.
apiVersion: v1
kind: Namespace
metadata:
  name: openclaw

View File

@ -0,0 +1,113 @@
# services/openclaw/ollama-deployment.yaml
# Ollama model server for OpenClaw: one replica on titan-20/titan-21 using the
# nvidia runtime class. An init container pulls the model into the persistent
# models volume before the server container starts.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: openclaw-ollama
  namespace: openclaw
  labels:
    app: openclaw-ollama
spec:
  replicas: 1
  revisionHistoryLimit: 2
  # Recreate: the models volume is a ReadWriteOnce PVC, so the old pod must be
  # gone before the new one starts.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: openclaw-ollama
  template:
    metadata:
      labels:
        app: openclaw-ollama
      annotations:
        ai.bstein.dev/model: qwen2.5:7b-instruct-q4_0
        ai.bstein.dev/gpu: Jetson pool (titan-20/21)
    spec:
      runtimeClassName: nvidia
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                      - titan-20
                      - titan-21
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: openclaw-ollama-models
      initContainers:
        # Starts a temporary ollama server, pulls OLLAMA_MODEL into the models
        # volume, then stops the server so the init container can exit.
        - name: warm-model
          image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
          imagePullPolicy: IfNotPresent
          env:
            - name: OLLAMA_HOST
              value: 0.0.0.0
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: OLLAMA_MODEL
              value: qwen2.5:7b-instruct-q4_0
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
          command:
            - /bin/sh
            - -c
            - |
              set -e
              ollama serve >/tmp/ollama.log 2>&1 &
              server_pid=$!
              # Poll the API instead of sleeping a fixed time: the server may
              # need more (or less) than a few seconds to accept requests.
              tries=0
              until ollama list >/dev/null 2>&1; do
                tries=$((tries + 1))
                if [ "${tries}" -ge 60 ]; then
                  echo "ollama serve did not become ready within 60s" >&2
                  cat /tmp/ollama.log >&2 || true
                  exit 1
                fi
                sleep 1
              done
              ollama pull "${OLLAMA_MODEL}"
              # Stop the server by PID so this does not depend on pkill being
              # present in the image.
              kill "${server_pid}" 2>/dev/null || true
              wait "${server_pid}" 2>/dev/null || true
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          resources:
            requests:
              cpu: "1"
              memory: 4Gi
              nvidia.com/gpu.shared: 1
            limits:
              cpu: "4"
              memory: 10Gi
              nvidia.com/gpu.shared: 1
      containers:
        - name: ollama
          image: ollama/ollama@sha256:2c9595c555fd70a28363489ac03bd5bf9e7c5bdf2890373c3a830ffd7252ce6d
          imagePullPolicy: IfNotPresent
          ports:
            - name: http
              containerPort: 11434
          env:
            - name: OLLAMA_HOST
              value: 0.0.0.0
            - name: OLLAMA_KEEP_ALIVE
              value: 6h
            # Must match the path used by warm-model so the pulled model is
            # found.
            - name: OLLAMA_MODELS
              value: /root/.ollama
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
          volumeMounts:
            - name: models
              mountPath: /root/.ollama
          readinessProbe:
            httpGet:
              path: /api/tags
              port: 11434
            initialDelaySeconds: 15
            periodSeconds: 10
            timeoutSeconds: 5
          resources:
            requests:
              cpu: "2"
              memory: 8Gi
              nvidia.com/gpu.shared: 1
            limits:
              cpu: "6"
              memory: 12Gi
              nvidia.com/gpu.shared: 1

View File

@ -0,0 +1,27 @@
# services/openclaw/pvc.yaml
# Persistent storage for the openclaw namespace, both on the asteria storage
# class with ReadWriteOnce access.

# Mounted by the openclaw Deployment at /home/node/.openclaw.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: openclaw-home
  namespace: openclaw
spec:
  storageClassName: asteria
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
---
# Mounted by the openclaw-ollama Deployment at /root/.ollama.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: openclaw-ollama-models
  namespace: openclaw
spec:
  storageClassName: asteria
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 16Gi

View File

@ -0,0 +1,70 @@
# services/openclaw/rbac.yaml
# Identity and cluster-wide read-only access for the openclaw gateway pod.
# Every rule grants only get/list/watch; Secrets are not listed.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: openclaw-triage
  namespace: openclaw
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: openclaw-triage-readonly
rules:
  # Core API group
  - apiGroups:
      - ""
    resources:
      - configmaps
      - endpoints
      - events
      - namespaces
      - nodes
      - persistentvolumeclaims
      - persistentvolumes
      - pods
      - pods/log
      - replicationcontrollers
      - services
    verbs:
      - get
      - list
      - watch
  # Workload controllers
  - apiGroups:
      - apps
    resources:
      - daemonsets
      - deployments
      - replicasets
      - statefulsets
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - batch
    resources:
      - cronjobs
      - jobs
    verbs:
      - get
      - list
      - watch
  # Networking
  - apiGroups:
      - networking.k8s.io
    resources:
      - ingresses
      - networkpolicies
    verbs:
      - get
      - list
      - watch
  # Flux custom resources
  - apiGroups:
      - helm.toolkit.fluxcd.io
    resources:
      - helmreleases
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - kustomize.toolkit.fluxcd.io
    resources:
      - kustomizations
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - source.toolkit.fluxcd.io
    resources:
      - gitrepositories
      - helmrepositories
    verbs:
      - get
      - list
      - watch
---
# Binds the read-only ClusterRole to the openclaw-triage ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: openclaw-triage-readonly
subjects:
  - kind: ServiceAccount
    name: openclaw-triage
    namespace: openclaw
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: openclaw-triage-readonly

View File

@ -0,0 +1,35 @@
# services/openclaw/service.yaml
# ClusterIP Services for the two openclaw workloads; each targets its pod's
# named container port.

# Gateway, reached by the openclaw Ingress on port 18789.
apiVersion: v1
kind: Service
metadata:
  name: openclaw
  namespace: openclaw
  labels:
    app: openclaw
spec:
  type: ClusterIP
  selector:
    app: openclaw
  ports:
    - name: gateway
      protocol: TCP
      port: 18789
      targetPort: gateway
---
# Ollama HTTP API on port 11434.
apiVersion: v1
kind: Service
metadata:
  name: openclaw-ollama
  namespace: openclaw
  labels:
    app: openclaw-ollama
spec:
  type: ClusterIP
  selector:
    app: openclaw-ollama
  ports:
    - name: http
      protocol: TCP
      port: 11434
      targetPort: http