veles: stage atlas infrastructure

2026-06-09 00:46:46 -03:00 · 2026-06-09 00:46:46 -03:00 · 654900b8a2
commit 654900b8a2
parent e1d091eb14
41 changed files with 1562 additions and 14 deletions
--- a/clusters/atlas/flux-system/applications/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/kustomization.yaml
@ -28,6 +28,7 @@ resources:
  - ai-llm/kustomization.yaml
  - openclaw/kustomization.yaml
  - game-stream/kustomization.yaml
+  - veles/kustomization.yaml
  - typhon/kustomization.yaml
  - nextcloud/kustomization.yaml
  - nextcloud-mail-sync/kustomization.yaml
--- a/clusters/atlas/flux-system/applications/veles/image-automation.yaml
+++ b/clusters/atlas/flux-system/applications/veles/image-automation.yaml
@ -0,0 +1,29 @@
+# clusters/atlas/flux-system/applications/veles/image-automation.yaml
+# Not active yet: list this file in the parent applications kustomization
+# once the veles namespace exists and the Harbor repos have initial tags.
+apiVersion: image.toolkit.fluxcd.io/v1
+kind: ImageUpdateAutomation
+metadata:
+  namespace: veles
+  name: veles
+spec:
+  sourceRef:
+    kind: GitRepository
+    namespace: flux-system
+    name: flux-system
+  interval: 1m0s
+  # Apply the Setters strategy to manifests under services/veles.
+  update:
+    path: services/veles
+    strategy: Setters
+  # Check out main, commit as flux-bot, and push back to main.
+  git:
+    checkout:
+      ref:
+        branch: main
+    commit:
+      messageTemplate: 'chore(veles): automated image update'
+      author:
+        name: flux-bot
+        email: ops@bstein.dev
+    push:
+      branch: main
--- a/clusters/atlas/flux-system/applications/veles/kustomization.yaml
+++ b/clusters/atlas/flux-system/applications/veles/kustomization.yaml
@ -0,0 +1,28 @@
+# clusters/atlas/flux-system/applications/veles/kustomization.yaml
+# Flux Kustomization that reconciles ./services/veles into namespace veles.
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  namespace: flux-system
+  name: veles
+  annotations:
+    kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
+spec:
+  sourceRef:
+    kind: GitRepository
+    namespace: flux-system
+    name: flux-system
+  path: ./services/veles
+  targetNamespace: veles
+  interval: 10m
+  timeout: 20m
+  prune: true
+  # Do not block reconciliation on readiness of the applied resources.
+  wait: false
+  # Kustomizations this one depends on.
+  dependsOn:
+    - name: cert-manager
+    - name: core
+    - name: keycloak
+    - name: longhorn
+    - name: traefik
+    - name: vault
+    - name: vault-csi
+    - name: vault-injector
--- a/infrastructure/core/node-prefer-noschedule-cronjob.yaml
+++ b/infrastructure/core/node-prefer-noschedule-cronjob.yaml
@ -55,6 +55,20 @@ spec:
                    k label node titan-22 atlas.bstein.dev/general-compute=last-resort --overwrite=true || true
                  fi

+                  if k get node titan-23 >/dev/null 2>&1; then
+                    k label node titan-23 \
+                      veles.bstein.dev/simulation=true \
+                      veles.bstein.dev/node-pool=oceanus \
+                      node-role.kubernetes.io/veles-sim=true \
+                      longhorn-host=true \
+                      hardware=oceanus \
+                      --overwrite=true || true
+                    k label node titan-23 node-role.kubernetes.io/worker- || true
+                    k taint node titan-23 veles.bstein.dev/simulation=true:NoSchedule --overwrite=true || true
+                  else
+                    echo "skipping missing node titan-23"
+                  fi
+
                  for node in titan-13 titan-15 titan-17 titan-19; do
                    if k get node "${node}" >/dev/null 2>&1; then
                      k label node "${node}" atlas.bstein.dev/spillover=true longhorn-host=true --overwrite=true || true
--- a/infrastructure/longhorn/core/helmrelease.yaml
+++ b/infrastructure/longhorn/core/helmrelease.yaml
@ -81,7 +81,13 @@ spec:
          tag: v2.16.0
    defaultSettings:
      systemManagedPodsImagePullPolicy: Always
+      taintToleration: veles.bstein.dev/simulation=true:NoSchedule
    longhornManager:
+      tolerations:
+        - key: veles.bstein.dev/simulation
+          operator: Equal
+          value: "true"
+          effect: NoSchedule
      nodeSelector:
        longhorn-host: "true"
    longhornDriver:
--- a/infrastructure/longhorn/core/kustomization.yaml
+++ b/infrastructure/longhorn/core/kustomization.yaml
@ -7,6 +7,7 @@ resources:
  - secretproviderclass.yaml
  - vault-sync-deployment.yaml
  - helmrelease.yaml
+  - veles-recurring-jobs.yaml
  - longhorn-settings-ensure-job.yaml
  - longhorn-disk-tags-ensure-job.yaml

--- a/infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml
+++ b/infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml
@ -2,7 +2,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: longhorn-disk-tags-ensure-1
+  name: longhorn-disk-tags-ensure-3
  namespace: longhorn-system
 spec:
  backoffLimit: 0
--- a/infrastructure/longhorn/core/scripts/longhorn_disk_tags_ensure.py
+++ b/infrastructure/longhorn/core/scripts/longhorn_disk_tags_ensure.py
@ -17,10 +17,28 @@ import urllib.request

 LONGHORN_NS = "longhorn-system"
 LONGHORN_API = "/apis/longhorn.io/v1beta2/namespaces/{namespace}/nodes"
-DESIRED_TAGS = {
-    "/mnt/astreae": "astreae",
-    "/mnt/asteria": "asteria",
+# Tags to set on any disk whose spec path equals the key. main() compares the
+# disk's current tag list to this list with ==, so element order is significant.
+# NOTE(review): if Longhorn hands tags back in a different order (e.g. sorted),
+# the disk is re-patched on every run — confirm.
+DESIRED_DISK_TAGS = {
+    "/mnt/astreae": ["astreae"],
+    "/mnt/asteria": ["asteria"],
+    "/mnt/veles": ["veles-oceanus", "veles-db", "veles-artifacts"],
+    "/mnt/veles-db": ["veles-oceanus", "veles-db"],
+    "/mnt/veles-artifacts": ["veles-oceanus", "veles-artifacts"],
 }
+# Node-level tags keyed by node name; main() merges these into the tags the
+# node already has.
+DESIRED_NODE_TAGS = {
+    "titan-23": ["veles-oceanus"],
+}
+# Disks to add, keyed by node name and then disk name; main() adds an entry
+# only when the node spec has no disk with that name.
+DESIRED_NODE_DISKS = {
+    "titan-23": {
+        "veles-oceanus": {
+            "path": "/mnt/veles",
+            "allowScheduling": True,
+            "evictionRequested": False,
+            "storageReserved": 0,
+            "tags": ["veles-oceanus", "veles-db", "veles-artifacts"],
+        }
+    }
+}
+# Nodes on which main() sets allowScheduling=False for disks at /var/lib/longhorn.
+DISABLE_DEFAULT_DISK_NODES = {"titan-23"}


 def api_base() -> str:
@ -63,8 +81,30 @@ def list_nodes() -> list[dict]:
    return data.get("items", [])


-def patch_disk_tags(node_name: str, disk_name: str, desired_tag: str) -> None:
-    body = {"spec": {"disks": {disk_name: {"tags": [desired_tag]}}}}
+def merged_tags(current_tags: list[str], desired_tags: list[str]) -> list[str]:
+    """Return the sorted, de-duplicated union of the two tag lists."""
+    return sorted({*current_tags, *desired_tags})
+
+
+def patch_node_tags(node_name: str, desired_tags: list[str]) -> None:
+    """PATCH the Longhorn node ``node_name`` so that ``spec.tags`` is ``desired_tags``."""
+    node_path = f"{LONGHORN_API.format(namespace=LONGHORN_NS)}/{node_name}"
+    request_json("PATCH", node_path, body={"spec": {"tags": desired_tags}})
+
+
+def patch_disk_tags(node_name: str, disk_name: str, desired_tags: list[str]) -> None:
+    """PATCH one disk of a Longhorn node so that its ``tags`` is ``desired_tags``.
+
+    Only ``spec.disks[disk_name].tags`` is included in the request body.
+    """
+    disk_patch = {disk_name: {"tags": desired_tags}}
+    node_path = f"{LONGHORN_API.format(namespace=LONGHORN_NS)}/{node_name}"
+    request_json("PATCH", node_path, body={"spec": {"disks": disk_patch}})
+
+
+def patch_disks(node_name: str, disks: dict) -> None:
+    body = {"spec": {"disks": disks}}
    request_json(
        "PATCH",
        f"{LONGHORN_API.format(namespace=LONGHORN_NS)}/{node_name}",
@ -78,18 +118,52 @@ def main() -> int:

    for node in list_nodes():
        name = node.get("metadata", {}).get("name", "")
+        desired_node_tags = DESIRED_NODE_TAGS.get(name)
+        if desired_node_tags:
+            current_node_tags = node.get("spec", {}).get("tags") or []
+            next_node_tags = merged_tags(current_node_tags, desired_node_tags)
+            if current_node_tags != next_node_tags:
+                print(f"patching {name} node tags={current_node_tags!r} -> {next_node_tags!r}")
+                patch_node_tags(name, next_node_tags)
+                changed += 1
+            else:
+                skipped += 1
+
        spec_disks = node.get("spec", {}).get("disks", {}) or {}
+        desired_disks = DESIRED_NODE_DISKS.get(name, {})
+        missing_disks = {
+            disk_name: disk_spec
+            for disk_name, disk_spec in desired_disks.items()
+            if disk_name not in spec_disks
+        }
+        if missing_disks:
+            print(f"adding {name} disks={sorted(missing_disks)}")
+            patch_disks(name, missing_disks)
+            changed += len(missing_disks)
+            spec_disks = {**spec_disks, **missing_disks}
+
+        if name in DISABLE_DEFAULT_DISK_NODES:
+            disable_patch = {}
+            for disk_name, disk in spec_disks.items():
+                disk_path = (disk.get("path") or "").rstrip("/")
+                if disk_path == "/var/lib/longhorn" and disk.get("allowScheduling", True):
+                    disable_patch[disk_name] = {"allowScheduling": False}
+            if disable_patch:
+                print(f"disabling default Longhorn scheduling on {name} disks={sorted(disable_patch)}")
+                patch_disks(name, disable_patch)
+                changed += len(disable_patch)
+
        for disk_name, disk in spec_disks.items():
            disk_path = disk.get("path")
-            desired_tag = DESIRED_TAGS.get(disk_path)
-            if not desired_tag:
+            desired_disk_tags = DESIRED_DISK_TAGS.get(disk_path)
+            if not desired_disk_tags:
                continue
            current_tags = disk.get("tags") or []
-            if current_tags == [desired_tag]:
+            if current_tags == desired_disk_tags:
                skipped += 1
                continue
-            print(f"patching {name}:{disk_name} path={disk_path} tags={current_tags!r} -> {[desired_tag]!r}")
-            patch_disk_tags(name, disk_name, desired_tag)
+            print(f"patching {name}:{disk_name} path={disk_path} tags={current_tags!r} -> {desired_disk_tags!r}")
+            patch_disk_tags(name, disk_name, desired_disk_tags)
            changed += 1

    print(f"done: changed={changed} skipped={skipped}")
--- a/infrastructure/longhorn/core/veles-recurring-jobs.yaml
+++ b/infrastructure/longhorn/core/veles-recurring-jobs.yaml
@ -0,0 +1,28 @@
+# infrastructure/longhorn/core/veles-recurring-jobs.yaml
+# Longhorn recurring jobs for Veles Postgres volumes: one backup job and one
+# snapshot job, both in groups veles and veles-postgres.
+apiVersion: longhorn.io/v1beta2
+kind: RecurringJob
+metadata:
+  namespace: longhorn-system
+  name: veles-postgres-backup
+spec:
+  task: backup
+  # 05:30 every day; keep 7 backups.
+  cron: '30 5 * * *'
+  retain: 7
+  concurrency: 1
+  groups:
+    - veles
+    - veles-postgres
+---
+apiVersion: longhorn.io/v1beta2
+kind: RecurringJob
+metadata:
+  namespace: longhorn-system
+  name: veles-postgres-snapshot
+spec:
+  task: snapshot
+  # Every 30 minutes; keep 8 snapshots.
+  cron: '*/30 * * * *'
+  retain: 8
+  concurrency: 1
+  groups:
+    - veles
+    - veles-postgres
--- a/infrastructure/modules/base/priorityclass/kustomization.yaml
+++ b/infrastructure/modules/base/priorityclass/kustomization.yaml
@ -3,3 +3,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
  - scavenger.yaml
+  - veles.yaml
--- a/infrastructure/modules/base/priorityclass/veles.yaml
+++ b/infrastructure/modules/base/priorityclass/veles.yaml
@ -0,0 +1,17 @@
+# infrastructure/modules/base/priorityclass/veles.yaml
+# Two priority classes for Veles: veles-core (500) and the lower,
+# non-preempting veles-sim (50). Neither is the cluster default.
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: veles-core
+description: 'For Veles core database, API, and controller workloads'
+value: 500
+globalDefault: false
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: veles-sim
+description: 'For Veles simulation jobs; lower than core and non-preempting'
+value: 50
+preemptionPolicy: Never
+globalDefault: false
--- a/infrastructure/modules/base/storageclass/kustomization.yaml
+++ b/infrastructure/modules/base/storageclass/kustomization.yaml
@ -5,3 +5,6 @@ resources:
  - asteria.yaml
  - asteria-encrypted.yaml
  - astreae.yaml
+  - veles-oceanus-db.yaml
+  - veles-oceanus-artifacts.yaml
+  - veles-oceanus-policy.yaml
--- a/infrastructure/modules/base/storageclass/veles-oceanus-artifacts.yaml
+++ b/infrastructure/modules/base/storageclass/veles-oceanus-artifacts.yaml
@ -0,0 +1,20 @@
+# infrastructure/modules/base/storageclass/veles-oceanus-artifacts.yaml
+# Longhorn class for Veles artifact volumes: one replica, strict-local, placed
+# via node tag veles-oceanus and disk tags veles-oceanus,veles-artifacts.
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: veles-oceanus-artifacts
+  annotations:
+    veles.bstein.dev/allowed-namespace: veles
+provisioner: driver.longhorn.io
+volumeBindingMode: WaitForFirstConsumer
+reclaimPolicy: Retain
+allowVolumeExpansion: true
+parameters:
+  # Placement selectors.
+  nodeSelector: veles-oceanus
+  diskSelector: veles-oceanus,veles-artifacts
+  # Replication and locality.
+  numberOfReplicas: '1'
+  dataLocality: strict-local
+  replicaAutoBalance: disabled
+  staleReplicaTimeout: '30'
+  # Volume contents.
+  fsType: ext4
+  fromBackup: ''
--- a/infrastructure/modules/base/storageclass/veles-oceanus-db.yaml
+++ b/infrastructure/modules/base/storageclass/veles-oceanus-db.yaml
@ -0,0 +1,21 @@
+# infrastructure/modules/base/storageclass/veles-oceanus-db.yaml
+# Longhorn class for Veles database volumes: one replica, strict-local, placed
+# via node tag veles-oceanus and disk tags veles-oceanus,veles-db, with the
+# veles-postgres backup and snapshot recurring jobs selected by name.
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: veles-oceanus-db
+  annotations:
+    veles.bstein.dev/allowed-namespace: veles
+provisioner: driver.longhorn.io
+volumeBindingMode: WaitForFirstConsumer
+reclaimPolicy: Retain
+allowVolumeExpansion: true
+parameters:
+  # Placement selectors.
+  nodeSelector: veles-oceanus
+  diskSelector: veles-oceanus,veles-db
+  # Replication and locality.
+  numberOfReplicas: '1'
+  dataLocality: strict-local
+  replicaAutoBalance: disabled
+  staleReplicaTimeout: '30'
+  # Volume contents.
+  fsType: ext4
+  fromBackup: ''
+  # Recurring jobs referenced by name (isGroup is false for both).
+  recurringJobSelector: '[{"name":"veles-postgres-backup","isGroup":false},{"name":"veles-postgres-snapshot","isGroup":false}]'
--- a/infrastructure/modules/base/storageclass/veles-oceanus-policy.yaml
+++ b/infrastructure/modules/base/storageclass/veles-oceanus-policy.yaml
@ -0,0 +1,25 @@
+# infrastructure/modules/base/storageclass/veles-oceanus-policy.yaml
+# Rejects PersistentVolumeClaims outside namespace veles that request one of
+# the Veles Oceanus storage classes, whether through spec.storageClassName or
+# through the legacy volume.beta.kubernetes.io/storage-class annotation.
+apiVersion: admissionregistration.k8s.io/v1
+kind: ValidatingAdmissionPolicy
+metadata:
+  name: veles-oceanus-storage-namespace
+spec:
+  failurePolicy: Fail
+  matchConstraints:
+    resourceRules:
+      - apiGroups: [""]
+        apiVersions: ["v1"]
+        operations: ["CREATE", "UPDATE"]
+        resources: ["persistentvolumeclaims"]
+  validations:
+    # Class requested through spec.storageClassName.
+    - expression: "!has(object.spec.storageClassName) || !(object.spec.storageClassName in ['veles-oceanus-db', 'veles-oceanus-artifacts']) || object.metadata.namespace == 'veles'"
+      message: "Veles Oceanus storage classes are reserved for namespace veles"
+    # Class requested through the deprecated annotation, which would otherwise
+    # bypass the check above because spec.storageClassName can be left unset.
+    - expression: >-
+        !has(object.metadata.annotations)
+        || !('volume.beta.kubernetes.io/storage-class' in object.metadata.annotations)
+        || !(object.metadata.annotations['volume.beta.kubernetes.io/storage-class']
+        in ['veles-oceanus-db', 'veles-oceanus-artifacts'])
+        || object.metadata.namespace == 'veles'
+      message: "Veles Oceanus storage classes are reserved for namespace veles"
+---
+# Binds the policy cluster-wide; violations are denied.
+apiVersion: admissionregistration.k8s.io/v1
+kind: ValidatingAdmissionPolicyBinding
+metadata:
+  name: veles-oceanus-storage-namespace
+spec:
+  policyName: veles-oceanus-storage-namespace
+  validationActions:
+    - Deny
--- a/services/jenkins/configmap-jcasc.yaml
+++ b/services/jenkins/configmap-jcasc.yaml
@ -429,6 +429,24 @@ data:
              }
            }
          }
+          pipelineJob('veles') {
+            disabled(true)
+            description('Staged Veles alpha image pipeline. Backend/frontend should build linux/amd64 and linux/arm64; sim-worker may begin amd64-only if Forge dependencies require it.')
+            definition {
+              cpsScm {
+                scm {
+                  git {
+                    remote {
+                      url('https://scm.bstein.dev/bstein/veles.git')
+                      credentials('gitea-pat')
+                    }
+                    branches('*/main')
+                  }
+                }
+                scriptPath('Jenkinsfile')
+              }
+            }
+          }
          multibranchPipelineJob('titan-iac-quality-gate') {
            branchSources {
              branchSource {
--- a/services/keycloak/kustomization.yaml
+++ b/services/keycloak/kustomization.yaml
@ -27,6 +27,7 @@ resources:
  - oneoffs/soteria-oidc-secret-ensure-job.yaml
  - oneoffs/quality-oidc-secret-ensure-job.yaml
  - oneoffs/agent-oidc-secret-ensure-job.yaml
+  - oneoffs/veles-realm-ensure-job.yaml
  - oneoffs/metis-ssh-keys-secret-ensure-job.yaml
  - oneoffs/metis-node-passwords-secret-ensure-job.yaml
  - oneoffs/harbor-oidc-secret-ensure-job.yaml
--- a/services/keycloak/oneoffs/veles-realm-ensure-job.yaml
+++ b/services/keycloak/oneoffs/veles-realm-ensure-job.yaml
@ -0,0 +1,332 @@
+# services/keycloak/oneoffs/veles-realm-ensure-job.yaml
+# One-off job for sso/veles-realm-ensure-1.
+# Purpose: create the Veles realm, groups, OIDC client, SMTP settings, and Vault client secret.
+# Keep suspended until Veles Vault paths/policies have reconciled, then unsuspend once.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: veles-realm-ensure-1
+  namespace: sso
+spec:
+  # Created suspended; see the header note for when to unsuspend.
+  suspend: true
+  # No retries: a failed pod fails the Job.
+  backoffLimit: 0
+  # The finished Job is cleaned up after one hour.
+  ttlSecondsAfterFinished: 3600
+  template:
+    metadata:
+      annotations:
+        # Vault Agent injection renders /vault/secrets/keycloak-admin-env.sh,
+        # which the container sources before running the script.
+        vault.hashicorp.com/agent-inject: "true"
+        vault.hashicorp.com/agent-pre-populate-only: "true"
+        vault.hashicorp.com/role: "sso-secrets"
+        vault.hashicorp.com/agent-inject-secret-keycloak-admin-env.sh: "kv/data/atlas/shared/keycloak-admin"
+        # Exports the Keycloak admin credentials and uses the Postmark
+        # "apikey" value as both SMTP user and SMTP password.
+        vault.hashicorp.com/agent-inject-template-keycloak-admin-env.sh: |
+          {{ with secret "kv/data/atlas/shared/keycloak-admin" }}
+          export KEYCLOAK_ADMIN="{{ .Data.data.username }}"
+          export KEYCLOAK_ADMIN_USER="{{ .Data.data.username }}"
+          export KEYCLOAK_ADMIN_PASSWORD="{{ .Data.data.password }}"
+          {{ end }}
+          {{ with secret "kv/data/atlas/shared/postmark-relay" }}
+          export KEYCLOAK_SMTP_USER="{{ index .Data.data "apikey" }}"
+          export KEYCLOAK_SMTP_PASSWORD="{{ index .Data.data "apikey" }}"
+          {{ end }}
+    spec:
+      # The script presents this service account's token to Vault's
+      # Kubernetes auth endpoint.
+      serviceAccountName: mas-secrets-ensure
+      restartPolicy: Never
+      # Required: a node labelled node-role.kubernetes.io/worker.
+      # Preferred: an arm64 node.
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node-role.kubernetes.io/worker
+                    operator: Exists
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              preference:
+                matchExpressions:
+                  - key: kubernetes.io/arch
+                    operator: In
+                    values: ["arm64"]
+      containers:
+        - name: configure
+          image: python:3.11-alpine
+          # Non-secret settings the inline Python script reads from os.environ.
+          env:
+            - name: KEYCLOAK_SERVER
+              value: http://keycloak.sso.svc.cluster.local
+            - name: KEYCLOAK_REALM
+              value: veles
+            - name: KEYCLOAK_CLIENT_ID
+              value: veles-web
+            - name: KEYCLOAK_PUBLIC_ISSUER
+              value: https://sso.bstein.dev/realms/veles
+            - name: VELES_BASE_URL
+              value: https://veles.bstein.dev
+            - name: KEYCLOAK_SMTP_HOST
+              value: mail.bstein.dev
+            - name: KEYCLOAK_SMTP_PORT
+              value: "587"
+            - name: KEYCLOAK_SMTP_FROM
+              value: no-reply-veles@bstein.dev
+            - name: KEYCLOAK_SMTP_FROM_NAME
+              value: Veles
+          # Source the Vault-rendered credentials, then run the inline Python script.
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -eu
+              . /vault/secrets/keycloak-admin-env.sh
+              python - <<'PY'
+              import json
+              import os
+              import time
+              import urllib.error
+              import urllib.parse
+              import urllib.request
+
+              # Required settings; a missing variable aborts the script with KeyError.
+              env = os.environ
+              base_url = env["KEYCLOAK_SERVER"].rstrip("/")
+              realm = env["KEYCLOAK_REALM"]
+              client_id = env["KEYCLOAK_CLIENT_ID"]
+              issuer = env["KEYCLOAK_PUBLIC_ISSUER"]
+              veles_base_url = env["VELES_BASE_URL"].rstrip("/")
+              admin_user = env["KEYCLOAK_ADMIN_USER"]
+              admin_password = env["KEYCLOAK_ADMIN_PASSWORD"]
+
+              def request(method, url, token=None, payload=None, headers=None, timeout=30):
+                  """Send one HTTP request and return ``(status, parsed_body)``.
+
+                  ``payload`` is sent as JSON when given; ``token`` is sent as a Bearer
+                  token. An empty response body yields ``None``. HTTP error responses
+                  are returned rather than raised; an error body that is not valid
+                  JSON is returned as ``{"raw": <text>}``.
+                  """
+                  req_headers = dict(headers) if headers else {}
+                  if token:
+                      req_headers["Authorization"] = f"Bearer {token}"
+                  data = None
+                  if payload is not None:
+                      data = json.dumps(payload).encode()
+                      req_headers["Content-Type"] = "application/json"
+                  req = urllib.request.Request(url, data=data, headers=req_headers, method=method)
+                  try:
+                      with urllib.request.urlopen(req, timeout=timeout) as resp:
+                          body = resp.read()
+                          parsed = json.loads(body.decode()) if body else None
+                          return resp.status, parsed
+                  except urllib.error.HTTPError as exc:
+                      raw = exc.read()
+                      if not raw:
+                          return exc.code, None
+                      try:
+                          return exc.code, json.loads(raw.decode())
+                      except Exception:
+                          return exc.code, {"raw": raw.decode(errors="replace")}
+
+              # Obtain an admin access token from the master realm (password grant
+              # through the admin-cli client), retrying up to 10 times with a
+              # growing delay while Keycloak is unreachable or erroring.
+              token_body = None
+              token_url = f"{base_url}/realms/master/protocol/openid-connect/token"
+              form = urllib.parse.urlencode(
+                  {
+                      "grant_type": "password",
+                      "client_id": "admin-cli",
+                      "username": admin_user,
+                      "password": admin_password,
+                  }
+              ).encode()
+              for attempt in range(1, 11):
+                  req = urllib.request.Request(
+                      token_url,
+                      data=form,
+                      headers={"Content-Type": "application/x-www-form-urlencoded"},
+                      method="POST",
+                  )
+                  try:
+                      with urllib.request.urlopen(req, timeout=10) as resp:
+                          token_body = json.loads(resp.read().decode())
+                          break
+                  except (urllib.error.URLError, TimeoutError) as exc:
+                      # HTTPError is a URLError subclass. A rejected request (bad
+                      # credentials, forbidden) cannot succeed on retry, and repeating
+                      # it only hammers the login endpoint, so fail immediately.
+                      rejected = isinstance(exc, urllib.error.HTTPError) and exc.code in (400, 401, 403)
+                      if rejected:
+                          raise SystemExit(f"Keycloak token request rejected: status={exc.code}")
+                      if attempt == 10:
+                          raise SystemExit(f"Keycloak token request failed after retries: {exc}")
+                      time.sleep(attempt * 2)
+              # Report a malformed token response explicitly instead of a bare KeyError.
+              token = (token_body or {}).get("access_token")
+              if not token:
+                  raise SystemExit("Keycloak token response did not include access_token")
+
+              # SMTP settings written into the realm: authenticated, STARTTLS, no implicit SSL.
+              smtp = {
+                  "host": os.environ["KEYCLOAK_SMTP_HOST"],
+                  "port": os.environ["KEYCLOAK_SMTP_PORT"],
+                  "from": os.environ["KEYCLOAK_SMTP_FROM"],
+                  "fromDisplayName": os.environ["KEYCLOAK_SMTP_FROM_NAME"],
+                  "replyTo": os.environ["KEYCLOAK_SMTP_FROM"],
+                  "replyToDisplayName": os.environ["KEYCLOAK_SMTP_FROM_NAME"],
+                  "user": os.environ["KEYCLOAK_SMTP_USER"],
+                  "password": os.environ["KEYCLOAK_SMTP_PASSWORD"],
+                  "auth": "true",
+                  "starttls": "true",
+                  "ssl": "false",
+              }
+
+              # Settings applied when the realm is created and re-applied on every run.
+              realm_settings = {
+                  "enabled": True,
+                  "registrationAllowed": False,
+                  "resetPasswordAllowed": True,
+                  "verifyEmail": True,
+                  "loginWithEmailAllowed": True,
+                  "duplicateEmailsAllowed": False,
+                  "smtpServer": smtp,
+              }
+              realm_url = f"{base_url}/admin/realms/{realm}"
+
+              status, realm_rep = request("GET", realm_url, token)
+              if status == 404:
+                  # Realm is missing: create it, then read back the full representation.
+                  new_realm = {"realm": realm, **realm_settings}
+                  status, body = request("POST", f"{base_url}/admin/realms", token, new_realm)
+                  if status not in (201, 204, 409):
+                      raise SystemExit(f"Realm create failed: status={status} body={body}")
+                  status, realm_rep = request("GET", realm_url, token)
+              if status != 200 or not isinstance(realm_rep, dict):
+                  raise SystemExit(f"Realm fetch failed: status={status}")
+
+              # Overlay the enforced settings on the fetched representation and write it back.
+              realm_rep.update(realm_settings)
+              status, body = request("PUT", realm_url, token, realm_rep)
+              if status not in (200, 204):
+                  raise SystemExit(f"Realm update failed: status={status} body={body}")
+
+              def ensure_group(name):
+                  """Create group ``name`` in the realm unless a group with exactly that name is found.
+
+                  Exits the script if the search fails or the create call returns
+                  anything other than 201, 204 or 409.
+                  """
+                  groups_url = f"{base_url}/admin/realms/{realm}/groups"
+                  status, found = request("GET", f"{groups_url}?search={urllib.parse.quote(name)}", token)
+                  if status != 200:
+                      raise SystemExit(f"Group search failed for {name}: status={status}")
+                  # The search may return partial matches, so compare names exactly.
+                  existing_names = {group.get("name") for group in found or []}
+                  if name in existing_names:
+                      return
+                  status, body = request("POST", groups_url, token, {"name": name})
+                  if status not in (201, 204, 409):
+                      raise SystemExit(f"Group create failed for {name}: status={status} body={body}")
+
+              for group_name in ("alpha", "admin"):
+                  ensure_group(group_name)
+
+              # Ensure the confidential OIDC client exists and carries the expected
+              # settings: look it up by clientId, create it when absent, then PUT
+              # the full payload either way.
+              clients_url = f"{base_url}/admin/realms/{realm}/clients"
+              client_lookup_url = f"{clients_url}?clientId={urllib.parse.quote(client_id)}"
+              status, clients = request("GET", client_lookup_url, token)
+              if status != 200:
+                  raise SystemExit(f"Client lookup failed: status={status}")
+              client_uuid = clients[0]["id"] if clients else None
+              client_payload = {
+                  "clientId": client_id,
+                  "enabled": True,
+                  "protocol": "openid-connect",
+                  "publicClient": False,
+                  "standardFlowEnabled": True,
+                  "implicitFlowEnabled": False,
+                  "directAccessGrantsEnabled": False,
+                  "serviceAccountsEnabled": False,
+                  "redirectUris": [f"{veles_base_url}/*"],
+                  "webOrigins": [veles_base_url],
+                  "rootUrl": veles_base_url,
+                  "baseUrl": "/",
+                  "attributes": {
+                      "pkce.code.challenge.method": "S256",
+                      "post.logout.redirect.uris": f"{veles_base_url}/*",
+                  },
+              }
+              if not client_uuid:
+                  status, body = request("POST", clients_url, token, client_payload)
+                  if status not in (201, 204, 409):
+                      raise SystemExit(f"Client create failed: status={status} body={body}")
+                  status, clients = request("GET", client_lookup_url, token)
+                  # On an error status the body is a dict, not a list, so check the
+                  # status before indexing into it.
+                  if status != 200:
+                      raise SystemExit(f"Client lookup failed after create: status={status}")
+                  client_uuid = clients[0]["id"] if clients else None
+              if not client_uuid:
+                  raise SystemExit(f"Client {client_id} not found after create")
+              status, body = request("PUT", f"{clients_url}/{client_uuid}", token, client_payload)
+              if status not in (200, 204):
+                  raise SystemExit(f"Client update failed: status={status} body={body}")
+
+              # Ensure the client has a "groups" protocol mapper that puts group
+              # names (not full paths) into the ID token, access token and userinfo.
+              mappers_url = f"{base_url}/admin/realms/{realm}/clients/{client_uuid}/protocol-mappers/models"
+              mapper_payload = {
+                  "name": "groups",
+                  "protocol": "openid-connect",
+                  "protocolMapper": "oidc-group-membership-mapper",
+                  "consentRequired": False,
+                  "config": {
+                      "full.path": "false",
+                      "id.token.claim": "true",
+                      "access.token.claim": "true",
+                      "userinfo.token.claim": "true",
+                      "claim.name": "groups",
+                  },
+              }
+              status, mappers = request("GET", mappers_url, token)
+              if status != 200:
+                  raise SystemExit(f"Mapper lookup failed: status={status}")
+              # Take the id of the first mapper named "groups", if any.
+              mapper_id = None
+              for mapper in mappers or []:
+                  if mapper.get("name") == "groups":
+                      mapper_id = mapper.get("id")
+                      break
+              # Update the existing mapper in place, otherwise create it.
+              if mapper_id:
+                  mapper_method, mapper_target = "PUT", f"{mappers_url}/{mapper_id}"
+              else:
+                  mapper_method, mapper_target = "POST", mappers_url
+              status, body = request(mapper_method, mapper_target, token, mapper_payload)
+              if status not in (200, 201, 204):
+                  raise SystemExit(f"Mapper ensure failed: status={status} body={body}")
+
+              # Read the client secret Keycloak generated for the OIDC client.
+              status, secret = request(
+                  "GET",
+                  f"{base_url}/admin/realms/{realm}/clients/{client_uuid}/client-secret",
+                  token,
+              )
+              client_secret = (secret or {}).get("value")
+              if status != 200 or not client_secret:
+                  raise SystemExit(f"Client secret fetch failed: status={status}")
+
+              # Log in to Vault through Kubernetes auth with the pod's service account token.
+              vault_addr = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200")
+              with open("/var/run/secrets/kubernetes.io/serviceaccount/token", encoding="utf-8") as sa_token_file:
+                  jwt = sa_token_file.read().strip()
+              login_payload = json.dumps({"jwt": jwt, "role": os.environ.get("VAULT_ROLE", "sso-secrets")}).encode()
+              req = urllib.request.Request(
+                  f"{vault_addr}/v1/auth/kubernetes/login",
+                  data=login_payload,
+                  headers={"Content-Type": "application/json"},
+                  method="POST",
+              )
+              try:
+                  with urllib.request.urlopen(req, timeout=20) as resp:
+                      vault_token = json.loads(resp.read().decode())["auth"]["client_token"]
+              except urllib.error.HTTPError as exc:
+                  # Report only the status, consistent with the rest of the script.
+                  raise SystemExit(f"Vault login failed: status={exc.code}")
+
+              # Store the OIDC client settings for Veles in the KV store.
+              payload = {
+                  "data": {
+                      "client_id": client_id,
+                      "client_secret": client_secret,
+                      "issuer": issuer,
+                      "realm": realm,
+                      "required_groups": "alpha,admin",
+                  }
+              }
+              req = urllib.request.Request(
+                  f"{vault_addr}/v1/kv/data/atlas/veles/veles-oidc",
+                  data=json.dumps(payload).encode(),
+                  headers={"X-Vault-Token": vault_token, "Content-Type": "application/json"},
+                  method="POST",
+              )
+              try:
+                  with urllib.request.urlopen(req, timeout=20) as resp:
+                      if resp.status not in (200, 204):
+                          raise SystemExit(f"Vault write returned {resp.status}")
+              except urllib.error.HTTPError as exc:
+                  raise SystemExit(f"Vault write failed: status={exc.code}")
+
+              print("Veles Keycloak realm/client ready")
+              PY
--- a/services/maintenance/metis-configmap.yaml
+++ b/services/maintenance/metis-configmap.yaml
@ -9,7 +9,7 @@ data:
  METIS_INVENTORY_PATH: /app/inventory.titan-rpi4.yaml
  METIS_DATA_DIR: /var/lib/metis
  METIS_DEFAULT_FLASH_HOST: titan-20
-  METIS_FLASH_HOSTS: titan-20,titan-21,titan-22,titan-24,titan-19,titan-17,titan-15,titan-14,titan-12,titan-11,titan-10,titan-09,titan-08,titan-07,titan-06,titan-05,titan-04,titan-0c,titan-0b,titan-0a
+  METIS_FLASH_HOSTS: titan-20,titan-21,titan-22,titan-23,titan-24,titan-19,titan-17,titan-15,titan-14,titan-12,titan-11,titan-10,titan-09,titan-08,titan-07,titan-06,titan-05,titan-04,titan-0c,titan-0b,titan-0a
  METIS_LOCAL_HOST: titan-20
  METIS_ALLOWED_GROUPS: admin,maintenance
  METIS_MAX_DEVICE_BYTES: "1000000000000"
--- a/services/monitoring/dcgm-exporter.yaml
+++ b/services/monitoring/dcgm-exporter.yaml
@ -38,6 +38,12 @@ spec:
                    operator: NotIn
                    values:
                      - "true"
+                  - key: veles.bstein.dev/node-pool
+                    operator: NotIn
+                    values:
+                      - oceanus
+                  - key: node-role.kubernetes.io/accelerator
+                    operator: Exists
      tolerations:
        - operator: Exists
      containers:
--- a/services/monitoring/helmrelease.yaml
+++ b/services/monitoring/helmrelease.yaml
@ -50,6 +50,15 @@ spec:
  upgrade:
    disableWait: true
  values:
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: veles.bstein.dev/node-pool
+                  operator: NotIn
+                  values:
+                    - oceanus
    rbac:
      pspEnabled: false
    service:
--- a/services/vault/scripts/vault_k8s_auth_configure.sh
+++ b/services/vault/scripts/vault_k8s_auth_configure.sh
@ -240,6 +240,11 @@ write_policy_and_role "game-stream" "game-stream" "game-stream-vault" \
  "game-stream/*" ""
 write_policy_and_role "openclaw" "openclaw" "agent-vault" \
  "openclaw/*" ""
+write_policy_and_role "veles" "veles" "veles-backend,veles-postgres,veles-vault-sync" \
+  "veles/* shared/harbor-pull shared/postmark-relay" ""
+write_policy_and_role "veles-secrets" "veles" "veles-secrets-ensure" \
+  "shared/postmark-relay veles/*" \
+  "veles/*"
 write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync,metis" \
  "maintenance/ariadne-db maintenance/metis-oidc maintenance/soteria-oidc maintenance/metis-ssh-keys maintenance/metis-runtime portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull shared/soteria-restic harbor/harbor-core" "" \
  '
@ -266,8 +271,8 @@ write_policy_and_role "vault" "vault" "vault" \
  "vault/*" ""

 write_policy_and_role "sso-secrets" "sso" "mas-secrets-ensure" \
-  "shared/keycloak-admin maintenance/metis-ssh-keys" \
-  "harbor/harbor-oidc vault/vault-oidc-config comms/synapse-oidc logging/oauth2-proxy-logs-oidc finance/actual-oidc maintenance/metis-oidc maintenance/soteria-oidc maintenance/metis-ssh-keys openclaw/agent-oidc" \
+  "shared/keycloak-admin shared/postmark-relay maintenance/metis-ssh-keys" \
+  "harbor/harbor-oidc vault/vault-oidc-config comms/synapse-oidc logging/oauth2-proxy-logs-oidc finance/actual-oidc maintenance/metis-oidc maintenance/soteria-oidc maintenance/metis-ssh-keys openclaw/agent-oidc veles/veles-oidc" \
  '
 path "kv/data/atlas/nodes/*" {
  capabilities = ["create", "update", "read"]
--- a/services/veles/NOTES.md
+++ b/services/veles/NOTES.md
@ -0,0 +1,64 @@
+# Veles Infrastructure Contract
+
+This stack is staged for Flux and intentionally starts the app deployments at `replicas: 0` until images and the app-side runtime contract are ready.
+
+## Cluster Contract
+
+- Namespace: `veles`
+- Hostname: `https://veles.bstein.dev`
+- Alpha environment: shares the `veles` namespace; no alternate alpha namespace is used.
+- Backend service: `veles-backend.veles.svc.cluster.local:80`
+- Frontend service: `veles-frontend.veles.svc.cluster.local:80`
+- Postgres service: `veles-postgres.veles.svc.cluster.local:5432`
+- Artifact PVC: `veles-artifacts`, mounted at `/data/veles-artifacts`
+- Storage classes: `veles-oceanus-db`, `veles-oceanus-artifacts`
+- Images:
+  - `registry.bstein.dev/veles/veles-backend`
+  - `registry.bstein.dev/veles/veles-frontend`
+  - `registry.bstein.dev/veles/veles-sim-worker`
+
+## Runtime Env
+
+Veles should consume:
+
+- `VELES_PUBLIC_BASE_URL=https://veles.bstein.dev`
+- `VELES_OIDC_ISSUER=https://sso.bstein.dev/realms/veles`
+- `VELES_OIDC_CLIENT_ID=veles-web`
+- `VELES_OIDC_REQUIRED_GROUPS=alpha,admin`
+- `DATABASE_URL` from `kv/data/atlas/veles/veles-db`
+- `VELES_SESSION_SECRET` from `kv/data/atlas/veles/app-secrets`
+- `VELES_BYOK_ENCRYPTION_KEY` from `kv/data/atlas/veles/app-secrets`
+
+User OpenAI API keys must stay in the Veles database encrypted with `VELES_BYOK_ENCRYPTION_KEY`; do not store per-user BYOK secrets in Vault.
+
+## Simulation Jobs
+
+The backend service account can create, inspect, patch, watch, and delete Jobs — and read or delete their pods, pod logs, and events — only inside the `veles` namespace. Simulation pods should use service account `veles-sim`, set `automountServiceAccountToken: false`, and use:
+
+```yaml
+priorityClassName: veles-sim
+nodeSelector:
+  veles.bstein.dev/simulation: "true"
+tolerations:
+  - key: veles.bstein.dev/simulation
+    operator: Equal
+    value: "true"
+    effect: NoSchedule
+```
+
+## Staged Operator Steps
+
+1. Join `titan-23`/Oceanus to Atlas as a worker.
+2. Use Metis with `titan-23` in `METIS_FLASH_HOSTS`; the existing node secret placeholder uses `192.168.22.23`.
+3. Confirm the node normalizer applies the Veles labels and taint.
+4. Add Oceanus Longhorn disks at paths tagged by the Longhorn tag ensure job.
+5. Let Vault policy reconciliation run, then unsuspend `veles-secrets-ensure-1`.
+6. Unsuspend `veles-realm-ensure-1` in `services/keycloak` to create the realm/client secret.
+7. Create the Harbor `veles` project or robot access before image automation is enabled in production.
+8. Scale `veles-postgres`, then backend/frontend once app images exist.
+
+## Assumptions
+
+- `veles-oceanus-artifacts` is RWO for alpha; simulation workers should either run on Oceanus with the backend or stream logs to the backend, which owns writes.
+- Postgres uses Longhorn backup recurring jobs off Oceanus. This is not a substitute for a tested restore drill.
+- The Jenkins job skeleton points at the Veles repo but stays disabled until that repo provides a Jenkinsfile.
--- a/services/veles/artifacts-pvc.yaml
+++ b/services/veles/artifacts-pvc.yaml
@ -0,0 +1,16 @@
+# services/veles/artifacts-pvc.yaml
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: veles-artifacts  # claimed by the backend Deployment's `artifacts` volume
+  namespace: veles
+  labels:
+    app.kubernetes.io/name: veles
+    app.kubernetes.io/component: artifacts
+spec:
+  accessModes:
+    - ReadWriteOnce  # single-node attach; NOTES.md covers how simulation workers cope with this
+  storageClassName: veles-oceanus-artifacts
+  resources:
+    requests:
+      storage: 200Gi  # together with the 100Gi Postgres claim this reaches the 300Gi namespace storage quota
--- a/services/veles/backend-deployment.yaml
+++ b/services/veles/backend-deployment.yaml
@ -0,0 +1,97 @@
+# services/veles/backend-deployment.yaml
+# Veles API backend, staged at zero replicas until images exist. The Vault agent
+# renders /vault/secrets/veles-env.sh, which the entrypoint sources before exec.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: veles-backend
+  namespace: veles
+  labels:
+    app: veles-backend
+spec:
+  replicas: 0
+  revisionHistoryLimit: 2
+  # Recreate rather than the default RollingUpdate: a surge pod would add a
+  # second 2 CPU / 4Gi limit on top of Postgres, backend, and frontend, which
+  # exceeds veles-core-quota (limits.cpu 8, limits.memory 24Gi) and stalls the
+  # rollout. It also keeps a single writer on the ReadWriteOnce artifacts PVC.
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: veles-backend
+  template:
+    metadata:
+      labels:
+        app: veles-backend
+      annotations:
+        vault.hashicorp.com/agent-inject: "true"
+        vault.hashicorp.com/agent-pre-populate-only: "true"
+        vault.hashicorp.com/role: "veles"
+        vault.hashicorp.com/agent-inject-secret-veles-env.sh: "kv/data/atlas/veles/veles-db"
+        vault.hashicorp.com/agent-inject-template-veles-env.sh: |
+          {{- with secret "kv/data/atlas/veles/veles-db" }}
+          export DATABASE_URL="{{ .Data.data.DATABASE_URL }}"
+          export VELES_DATABASE_USER="{{ .Data.data.POSTGRES_USER }}"
+          export VELES_DATABASE_PASSWORD="{{ .Data.data.POSTGRES_PASSWORD }}"
+          {{- end }}
+          {{- with secret "kv/data/atlas/veles/veles-oidc" }}
+          export VELES_OIDC_CLIENT_SECRET="{{ .Data.data.client_secret }}"
+          {{- end }}
+          {{- with secret "kv/data/atlas/veles/app-secrets" }}
+          export VELES_SESSION_SECRET="{{ .Data.data.VELES_SESSION_SECRET }}"
+          export VELES_BYOK_ENCRYPTION_KEY="{{ .Data.data.VELES_BYOK_ENCRYPTION_KEY }}"
+          {{- end }}
+    spec:
+      serviceAccountName: veles-backend
+      priorityClassName: veles-core
+      nodeSelector:
+        veles.bstein.dev/node-pool: oceanus
+      tolerations:
+        - key: veles.bstein.dev/simulation  # presumably the taint Oceanus carries — confirm with the node normalizer
+          operator: Equal
+          value: "true"
+          effect: NoSchedule
+      securityContext:
+        fsGroup: 1000
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: backend
+          image: registry.bstein.dev/veles/veles-backend:0.1.0-0 # {"$imagepolicy": "veles:veles-backend"}
+          imagePullPolicy: IfNotPresent
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              if [ -f /vault/secrets/veles-env.sh ]; then
+                . /vault/secrets/veles-env.sh
+              fi
+              exec /app/veles-backend
+          ports:
+            - name: http
+              containerPort: 8080
+              protocol: TCP
+          envFrom:
+            - configMapRef:
+                name: veles-app-config
+          resources:
+            requests:
+              cpu: 500m
+              memory: 1Gi
+            limits:
+              cpu: "2"
+              memory: 4Gi
+          securityContext:
+            runAsNonRoot: true
+            runAsUser: 1000
+            runAsGroup: 1000
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+          volumeMounts:
+            - name: artifacts
+              mountPath: /data/veles-artifacts
+      volumes:
+        - name: artifacts
+          persistentVolumeClaim:
+            claimName: veles-artifacts
--- a/services/veles/configmap.yaml
+++ b/services/veles/configmap.yaml
@ -0,0 +1,23 @@
+# services/veles/configmap.yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: veles-app-config  # loaded via envFrom by the backend and frontend Deployments
+  namespace: veles
+data:
+  VELES_ENV: alpha
+  VELES_PUBLIC_BASE_URL: https://veles.bstein.dev  # same host the Ingress serves
+  VELES_OIDC_ISSUER: https://sso.bstein.dev/realms/veles
+  VELES_OIDC_CLIENT_ID: veles-web
+  VELES_OIDC_REQUIRED_GROUPS: alpha,admin
+  VELES_DATABASE_HOST: veles-postgres.veles.svc.cluster.local  # headless Postgres Service
+  VELES_DATABASE_PORT: "5432"  # quoted: ConfigMap values must be strings
+  VELES_DATABASE_NAME: veles
+  VELES_ARTIFACTS_PATH: /data/veles-artifacts  # matches the backend's artifacts volumeMount
+  VELES_SIM_NAMESPACE: veles
+  VELES_SIM_SERVICE_ACCOUNT: veles-sim
+  VELES_SIM_PRIORITY_CLASS: veles-sim
+  VELES_SIM_NODE_SELECTOR: veles.bstein.dev/simulation=true  # key=value form; presumably parsed by the backend — confirm
+  VELES_SIM_TOLERATION_KEY: veles.bstein.dev/simulation
+  VELES_SIM_TOLERATION_VALUE: "true"  # quoted so YAML keeps the string "true" rather than a boolean
+  VELES_LOG_RETENTION_DAYS: "30"
--- a/services/veles/frontend-deployment.yaml
+++ b/services/veles/frontend-deployment.yaml
@ -0,0 +1,81 @@
+# services/veles/frontend-deployment.yaml
+# Veles web frontend, staged at zero replicas. It is not pinned to Oceanus:
+# it must land on a worker node and prefers non-spillover rpi5 hosts.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: veles-frontend
+  namespace: veles
+  labels:
+    app: veles-frontend
+spec:
+  replicas: 0
+  revisionHistoryLimit: 2
+  selector:
+    matchLabels:
+      app: veles-frontend
+  template:
+    metadata:
+      labels:
+        app: veles-frontend
+    spec:
+      serviceAccountName: veles-frontend
+      priorityClassName: veles-core
+      securityContext:
+        fsGroup: 1000
+        seccompProfile:
+          type: RuntimeDefault
+      affinity:
+        nodeAffinity:
+          # Hard requirement: a worker node on one of the known hardware classes.
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node-role.kubernetes.io/worker
+                    operator: Exists
+                  - key: hardware
+                    operator: In
+                    values:
+                      - rpi5
+                      - rpi4
+                      - amd64
+          # Soft preferences: avoid spillover nodes first, then favour rpi5.
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              preference:
+                matchExpressions:
+                  - key: atlas.bstein.dev/spillover
+                    operator: DoesNotExist
+            - weight: 90
+              preference:
+                matchExpressions:
+                  - key: hardware
+                    operator: In
+                    values:
+                      - rpi5
+      containers:
+        - name: frontend
+          image: registry.bstein.dev/veles/veles-frontend:0.1.0-0 # {"$imagepolicy": "veles:veles-frontend"}
+          imagePullPolicy: IfNotPresent
+          envFrom:
+            - configMapRef:
+                name: veles-app-config
+          ports:
+            - name: http
+              containerPort: 8080
+              protocol: TCP
+          resources:
+            requests:
+              cpu: 100m
+              memory: 256Mi
+            limits:
+              cpu: 500m
+              memory: 512Mi
+          securityContext:
+            runAsNonRoot: true
+            runAsUser: 1000
+            runAsGroup: 1000
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+                - ALL
--- a/services/veles/image.yaml
+++ b/services/veles/image.yaml
@ -0,0 +1,69 @@
+# services/veles/image.yaml
+apiVersion: image.toolkit.fluxcd.io/v1
+kind: ImageRepository
+metadata:
+  name: veles-backend
+  namespace: veles
+spec:
+  image: registry.bstein.dev/veles/veles-backend
+  interval: 1m0s  # how often the registry is scanned for new tags
+  secretRef:
+    name: harbor-regcred  # registry credentials; produced by the veles-vault SecretProviderClass
+---
+apiVersion: image.toolkit.fluxcd.io/v1
+kind: ImagePolicy
+metadata:
+  name: veles-backend  # referenced as veles:veles-backend by the setter marker in backend-deployment.yaml
+  namespace: veles
+spec:
+  imageRepositoryRef:
+    name: veles-backend
+  policy:
+    semver:
+      range: ">=0.1.0-0"  # the -0 pre-release bound lets tags such as 0.1.0-0 qualify
+---
+apiVersion: image.toolkit.fluxcd.io/v1
+kind: ImageRepository
+metadata:
+  name: veles-frontend
+  namespace: veles
+spec:
+  image: registry.bstein.dev/veles/veles-frontend
+  interval: 1m0s
+  secretRef:
+    name: harbor-regcred
+---
+apiVersion: image.toolkit.fluxcd.io/v1
+kind: ImagePolicy
+metadata:
+  name: veles-frontend  # referenced as veles:veles-frontend by the setter marker in frontend-deployment.yaml
+  namespace: veles
+spec:
+  imageRepositoryRef:
+    name: veles-frontend
+  policy:
+    semver:
+      range: ">=0.1.0-0"
+---
+apiVersion: image.toolkit.fluxcd.io/v1
+kind: ImageRepository
+metadata:
+  name: veles-sim-worker
+  namespace: veles
+spec:
+  image: registry.bstein.dev/veles/veles-sim-worker
+  interval: 1m0s
+  secretRef:
+    name: harbor-regcred
+---
+apiVersion: image.toolkit.fluxcd.io/v1
+kind: ImagePolicy
+metadata:
+  name: veles-sim-worker
+  namespace: veles
+spec:
+  imageRepositoryRef:
+    name: veles-sim-worker
+  policy:
+    semver:
+      range: ">=0.1.0-0"
--- a/services/veles/ingress.yaml
+++ b/services/veles/ingress.yaml
@ -0,0 +1,50 @@
+# services/veles/ingress.yaml
+# Public entry point for veles.bstein.dev: /api, /events, and /ws go to the
+# backend Service; every other path falls through to the frontend Service.
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: veles
+  namespace: veles
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt
+    traefik.ingress.kubernetes.io/router.entrypoints: websecure
+    traefik.ingress.kubernetes.io/router.tls: "true"
+spec:
+  ingressClassName: traefik
+  tls:
+    - secretName: veles-tls
+      hosts:
+        - veles.bstein.dev
+  rules:
+    - host: veles.bstein.dev
+      http:
+        paths:
+          - pathType: Prefix
+            path: /api
+            backend:
+              service:
+                name: veles-backend
+                port:
+                  number: 80
+          - pathType: Prefix
+            path: /events
+            backend:
+              service:
+                name: veles-backend
+                port:
+                  number: 80
+          - pathType: Prefix
+            path: /ws
+            backend:
+              service:
+                name: veles-backend
+                port:
+                  number: 80
+          - pathType: Prefix
+            path: /
+            backend:
+              service:
+                name: veles-frontend
+                port:
+                  number: 80
--- a/services/veles/kustomization.yaml
+++ b/services/veles/kustomization.yaml
@ -0,0 +1,22 @@
+# services/veles/kustomization.yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+namespace: veles  # namespace applied to the namespaced resources listed below
+resources:
+  - namespace.yaml
+  - serviceaccounts.yaml
+  - secretproviderclass.yaml
+  - vault-sync-deployment.yaml  # mounts the SecretProviderClass that yields harbor-regcred
+  - resourcequota.yaml
+  - limitrange.yaml
+  - configmap.yaml
+  - rbac.yaml
+  - artifacts-pvc.yaml
+  - postgres-service.yaml
+  - postgres-statefulset.yaml
+  - services.yaml
+  - backend-deployment.yaml
+  - frontend-deployment.yaml
+  - image.yaml
+  - ingress.yaml
+  - oneoffs/veles-secrets-ensure-job.yaml  # ships suspended; see NOTES.md for when to unsuspend
--- a/services/veles/limitrange.yaml
+++ b/services/veles/limitrange.yaml
@ -0,0 +1,21 @@
+# services/veles/limitrange.yaml
+apiVersion: v1
+kind: LimitRange
+metadata:
+  name: veles-container-limits
+  namespace: veles
+spec:
+  limits:
+    - type: Container
+      defaultRequest:  # requests given to containers that declare none
+        cpu: 100m
+        memory: 256Mi
+      default:  # limits given to containers that declare none
+        cpu: 500m
+        memory: 512Mi
+      min:  # containers requesting less than this are rejected at admission
+        cpu: 10m
+        memory: 32Mi
+      max:  # per-container ceiling
+        cpu: "16"
+        memory: 32Gi
--- a/services/veles/namespace.yaml
+++ b/services/veles/namespace.yaml
@ -0,0 +1,8 @@
+# services/veles/namespace.yaml
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: veles  # the manifests under services/veles all target this namespace
+  labels:
+    app.kubernetes.io/name: veles
+    app.kubernetes.io/part-of: veles
--- a/services/veles/oneoffs/veles-secrets-ensure-job.yaml
+++ b/services/veles/oneoffs/veles-secrets-ensure-job.yaml
@ -0,0 +1,151 @@
+# services/veles/oneoffs/veles-secrets-ensure-job.yaml
+# One-off job for veles/veles-secrets-ensure-1.
+# Purpose: seed Veles Vault paths before app/Postgres pods are scaled up.
+# Keep suspended until the veles Vault role has reconciled, then unsuspend once.
+# Existing values are read back and reused, so a rerun does not rotate secrets.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: veles-secrets-ensure-1
+  namespace: veles
+spec:
+  suspend: true
+  backoffLimit: 0
+  ttlSecondsAfterFinished: 3600
+  template:
+    spec:
+      serviceAccountName: veles-secrets-ensure
+      restartPolicy: Never
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node-role.kubernetes.io/worker
+                    operator: Exists
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              preference:
+                matchExpressions:
+                  - key: kubernetes.io/arch
+                    operator: In
+                    values: ["arm64"]
+      containers:
+        - name: apply
+          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
+          # bash, not sh: `set -o pipefail` is not available in every /bin/sh.
+          command: ["/bin/bash", "-c"]
+          args:
+            - |
+              set -euo pipefail
+
+              vault_addr="${VAULT_ADDR:-http://vault.vault.svc.cluster.local:8200}"
+              vault_role="${VAULT_ROLE:-veles-secrets}"
+              jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
+              login_payload="$(jq -nc --arg jwt "${jwt}" --arg role "${vault_role}" '{jwt:$jwt, role:$role}')"
+              vault_token="$(curl -sS --request POST --data "${login_payload}" \
+                "${vault_addr}/v1/auth/kubernetes/login" | jq -r '.auth.client_token')"
+              if [ -z "${vault_token}" ] || [ "${vault_token}" = "null" ]; then
+                echo "vault login failed" >&2
+                exit 1
+              fi
+
+              # Prints the HTTP status on stdout and stores the response body in $2.
+              read_secret() {
+                path="$1"
+                out="$2"
+                curl -sS -o "${out}" -w "%{http_code}" \
+                  -H "X-Vault-Token: ${vault_token}" \
+                  "${vault_addr}/v1/kv/data/atlas/${path}" || true
+              }
+
+              # Posts a KV payload; aborts the job on any status other than 200 or 204.
+              write_secret() {
+                path="$1"
+                payload="$2"
+                out="$(mktemp)"
+                status="$(curl -sS -o "${out}" -w "%{http_code}" -X POST \
+                  -H "X-Vault-Token: ${vault_token}" \
+                  -H "Content-Type: application/json" \
+                  -d "${payload}" \
+                  "${vault_addr}/v1/kv/data/atlas/${path}")"
+                if [ "${status}" != "200" ] && [ "${status}" != "204" ]; then
+                  echo "Vault write failed for ${path} (status ${status})" >&2
+                  cat "${out}" >&2 || true
+                  exit 1
+                fi
+              }
+
+              # Emits $1 random bytes as single-line base64.
+              rand_b64() {
+                bytes="$1"
+                openssl rand -base64 "${bytes}" | tr -d '\n'
+              }
+
+              status="$(read_secret veles/veles-db /tmp/veles-db.json)"
+              if [ "${status}" = "200" ]; then
+                db_password="$(jq -r '.data.data.POSTGRES_PASSWORD // empty' /tmp/veles-db.json)"
+              elif [ "${status}" = "404" ]; then
+                db_password=""
+              else
+                echo "Vault read failed for veles-db (status ${status})" >&2
+                cat /tmp/veles-db.json >&2 || true
+                exit 1
+              fi
+              if [ -z "${db_password}" ]; then
+                db_password="$(rand_b64 36)"
+              fi
+              # Percent-encode the password inside DATABASE_URL only: base64
+              # output can contain '/', which would otherwise end the authority
+              # part of the URL. POSTGRES_PASSWORD keeps the raw value.
+              db_payload="$(jq -nc \
+                --arg host "veles-postgres.veles.svc.cluster.local" \
+                --arg port "5432" \
+                --arg db "veles" \
+                --arg user "veles" \
+                --arg password "${db_password}" \
+                '{data:{POSTGRES_HOST:$host,POSTGRES_PORT:$port,POSTGRES_DB:$db,POSTGRES_USER:$user,POSTGRES_PASSWORD:$password,DATABASE_URL:("postgresql://"+$user+":"+($password|@uri)+"@"+$host+":"+$port+"/"+$db+"?sslmode=disable")}}')"
+              write_secret veles/veles-db "${db_payload}"
+
+              status="$(read_secret veles/app-secrets /tmp/app-secrets.json)"
+              if [ "${status}" = "200" ]; then
+                session_secret="$(jq -r '.data.data.VELES_SESSION_SECRET // empty' /tmp/app-secrets.json)"
+                byok_key="$(jq -r '.data.data.VELES_BYOK_ENCRYPTION_KEY // empty' /tmp/app-secrets.json)"
+              elif [ "${status}" = "404" ]; then
+                session_secret=""
+                byok_key=""
+              else
+                echo "Vault read failed for app-secrets (status ${status})" >&2
+                cat /tmp/app-secrets.json >&2 || true
+                exit 1
+              fi
+              if [ -z "${session_secret}" ]; then
+                session_secret="$(rand_b64 48)"
+              fi
+              if [ -z "${byok_key}" ]; then
+                byok_key="$(rand_b64 32)"
+              fi
+              app_payload="$(jq -nc \
+                --arg session_secret "${session_secret}" \
+                --arg byok_key "${byok_key}" \
+                '{data:{VELES_SESSION_SECRET:$session_secret,VELES_BYOK_ENCRYPTION_KEY:$byok_key}}')"
+              write_secret veles/app-secrets "${app_payload}"
+
+              # SMTP is best-effort: written only when the shared relay secret has an apikey.
+              postmark_status="$(read_secret shared/postmark-relay /tmp/postmark.json)"
+              if [ "${postmark_status}" = "200" ]; then
+                smtp_password="$(jq -r '.data.data.apikey // empty' /tmp/postmark.json)"
+                if [ -n "${smtp_password}" ]; then
+                  smtp_payload="$(jq -nc \
+                    --arg host "mail.bstein.dev" \
+                    --arg port "587" \
+                    --arg user "${smtp_password}" \
+                    --arg password "${smtp_password}" \
+                    --arg from "no-reply-veles@bstein.dev" \
+                    --arg from_name "Veles" \
+                    '{data:{SMTP_HOST:$host,SMTP_PORT:$port,SMTP_USER:$user,SMTP_PASSWORD:$password,SMTP_FROM:$from,SMTP_FROM_NAME:$from_name,SMTP_STARTTLS:"true"}}')"
+                  write_secret veles/smtp "${smtp_payload}"
+                fi
+              fi
+
+              echo "Veles Vault paths ready: veles-db, app-secrets, smtp when Postmark relay exists"
--- a/services/veles/postgres-service.yaml
+++ b/services/veles/postgres-service.yaml
@ -0,0 +1,17 @@
+# services/veles/postgres-service.yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: veles-postgres  # matches the StatefulSet's serviceName
+  namespace: veles
+  labels:
+    app: veles-postgres
+spec:
+  clusterIP: None  # headless: no virtual IP is allocated
+  ports:
+    - name: postgres
+      port: 5432
+      protocol: TCP
+      targetPort: 5432
+  selector:
+    app: veles-postgres
--- a/services/veles/postgres-statefulset.yaml
+++ b/services/veles/postgres-statefulset.yaml
@ -0,0 +1,96 @@
+# services/veles/postgres-statefulset.yaml
+# Postgres for Veles, pinned to Oceanus and staged at zero replicas. The
+# database password is rendered to a file by the Vault agent, not set in env.
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: veles-postgres
+  namespace: veles
+  labels:
+    app: veles-postgres
+spec:
+  serviceName: veles-postgres
+  replicas: 0
+  selector:
+    matchLabels:
+      app: veles-postgres
+  persistentVolumeClaimRetentionPolicy:
+    whenDeleted: Retain
+    whenScaled: Retain
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        app: veles-postgres
+      annotations:
+        vault.hashicorp.com/agent-inject: "true"
+        vault.hashicorp.com/agent-pre-populate-only: "true"
+        vault.hashicorp.com/role: "veles"
+        vault.hashicorp.com/agent-inject-secret-postgres-password: "kv/data/atlas/veles/veles-db"
+        vault.hashicorp.com/agent-inject-template-postgres-password: |
+          {{- with secret "kv/data/atlas/veles/veles-db" -}}
+          {{ .Data.data.POSTGRES_PASSWORD }}
+          {{- end -}}
+    spec:
+      serviceAccountName: veles-postgres
+      priorityClassName: veles-core
+      nodeSelector:
+        veles.bstein.dev/node-pool: oceanus
+      tolerations:
+        - key: veles.bstein.dev/simulation
+          operator: Equal
+          value: "true"
+          effect: NoSchedule
+      securityContext:
+        fsGroup: 999
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: postgres
+          image: postgres:15
+          ports:
+            - name: postgres
+              containerPort: 5432
+              protocol: TCP
+          env:
+            - name: PGDATA
+              value: /var/lib/postgresql/data/pgdata
+            - name: POSTGRES_USER
+              value: veles
+            - name: POSTGRES_PASSWORD_FILE
+              value: /vault/secrets/postgres-password
+            - name: POSTGRES_DB
+              value: veles
+          resources:
+            requests:
+              cpu: "2"
+              memory: 8Gi
+            limits:
+              cpu: "4"
+              memory: 16Gi
+          securityContext:
+            # Start directly as the image's postgres user (UID/GID 999, same as
+            # fsGroup). With every capability dropped, a root start could not
+            # chown PGDATA or step down to postgres, so the entrypoint would fail.
+            runAsNonRoot: true
+            runAsUser: 999
+            runAsGroup: 999
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+          volumeMounts:
+            - name: postgres-data
+              mountPath: /var/lib/postgresql/data
+  volumeClaimTemplates:
+    - metadata:
+        name: postgres-data
+        labels:
+          app: veles-postgres
+          veles.bstein.dev/backup: longhorn
+      spec:
+        accessModes: ["ReadWriteOnce"]
+        storageClassName: veles-oceanus-db
+        resources:
+          requests:
+            storage: 100Gi
--- a/services/veles/rbac.yaml
+++ b/services/veles/rbac.yaml
@ -0,0 +1,39 @@
+# services/veles/rbac.yaml
+# Namespaced permissions that let the backend service account manage Jobs and
+# observe their pods, logs, and events.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: veles-backend-jobs
+  namespace: veles
+rules:
+  # Job lifecycle.
+  - apiGroups: [batch]
+    resources: [jobs]
+    verbs: [create, delete, deletecollection, get, list, patch, watch]
+  # Pods may be inspected and removed, but not created directly.
+  - apiGroups: [""]
+    resources: [pods]
+    verbs: [delete, get, list, watch]
+  # Read-only access to pod logs.
+  - apiGroups: [""]
+    resources: [pods/log]
+    verbs: [get, list, watch]
+  # Read-only access to events through both the core and events.k8s.io APIs.
+  - apiGroups: ["", events.k8s.io]
+    resources: [events]
+    verbs: [get, list, watch]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: veles-backend-jobs
+  namespace: veles
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: veles-backend-jobs
+subjects:
+  - kind: ServiceAccount
+    name: veles-backend
+    namespace: veles
--- a/services/veles/resourcequota.yaml
+++ b/services/veles/resourcequota.yaml
@ -0,0 +1,54 @@
+# services/veles/resourcequota.yaml
+apiVersion: v1
+kind: ResourceQuota
+metadata:
+  name: veles-namespace-quota  # unscoped: counts every pod in the namespace
+  namespace: veles
+spec:
+  hard:
+    requests.cpu: "12"
+    requests.memory: 24Gi
+    limits.cpu: "40"
+    limits.memory: 96Gi
+    pods: "60"
+    count/jobs.batch: "100"
+    persistentvolumeclaims: "8"
+    requests.storage: 300Gi
+---
+apiVersion: v1
+kind: ResourceQuota
+metadata:
+  name: veles-core-quota  # counts only pods whose priorityClassName is veles-core
+  namespace: veles
+spec:
+  hard:
+    requests.cpu: "4"
+    requests.memory: 12Gi
+    limits.cpu: "8"
+    limits.memory: 24Gi
+    pods: "12"
+  scopeSelector:
+    matchExpressions:
+      - scopeName: PriorityClass
+        operator: In
+        values:
+          - veles-core
+---
+apiVersion: v1
+kind: ResourceQuota
+metadata:
+  name: veles-sim-quota  # counts only pods whose priorityClassName is veles-sim
+  namespace: veles
+spec:
+  hard:
+    requests.cpu: "8"
+    requests.memory: 16Gi
+    limits.cpu: "32"
+    limits.memory: 72Gi
+    pods: "48"
+  scopeSelector:
+    matchExpressions:
+      - scopeName: PriorityClass
+        operator: In
+        values:
+          - veles-sim
--- a/services/veles/secretproviderclass.yaml
+++ b/services/veles/secretproviderclass.yaml
@ -0,0 +1,21 @@
+# services/veles/secretproviderclass.yaml
+apiVersion: secrets-store.csi.x-k8s.io/v1
+kind: SecretProviderClass
+metadata:
+  name: veles-vault  # mounted by the veles-vault-sync Deployment
+  namespace: veles
+spec:
+  provider: vault
+  parameters:
+    vaultAddress: "http://vault.vault.svc.cluster.local:8200"
+    roleName: "veles"  # Vault role; the Vault auth script lists veles-vault-sync among its service accounts
+    objects: |
+      - objectName: "harbor-pull__dockerconfigjson"
+        secretPath: "kv/data/atlas/shared/harbor-pull"
+        secretKey: "dockerconfigjson"
+  secretObjects:  # mirrors the mounted object into a Kubernetes Secret
+    - secretName: harbor-regcred  # pull secret named by the ServiceAccounts and ImageRepositories
+      type: kubernetes.io/dockerconfigjson
+      data:
+        - objectName: harbor-pull__dockerconfigjson
+          key: .dockerconfigjson
--- a/services/veles/serviceaccounts.yaml
+++ b/services/veles/serviceaccounts.yaml
@ -0,0 +1,45 @@
+# services/veles/serviceaccounts.yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: veles-backend  # subject of the veles-backend-jobs RoleBinding
+  namespace: veles
+imagePullSecrets:
+  - name: harbor-regcred
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: veles-frontend
+  namespace: veles
+imagePullSecrets:
+  - name: harbor-regcred
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: veles-postgres
+  namespace: veles
+imagePullSecrets:
+  - name: harbor-regcred
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: veles-vault-sync  # no pull secret: its pod is the one that mounts the class producing harbor-regcred
+  namespace: veles
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: veles-secrets-ensure  # identity the suspended secrets-ensure Job uses for its Vault login
+  namespace: veles
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: veles-sim
+  namespace: veles
+automountServiceAccountToken: false  # pods using this account get no API token unless they opt in
+imagePullSecrets:
+  - name: harbor-regcred
--- a/services/veles/services.yaml
+++ b/services/veles/services.yaml
@ -0,0 +1,34 @@
+# services/veles/services.yaml
+# Services for the two app Deployments; each exposes port 80 and forwards to
+# port 8080 on the pods selected by its `app` label.
+apiVersion: v1
+kind: Service
+metadata:
+  name: veles-backend
+  namespace: veles
+  labels:
+    app: veles-backend
+spec:
+  selector:
+    app: veles-backend
+  ports:
+    - name: http
+      protocol: TCP
+      port: 80
+      targetPort: 8080
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: veles-frontend
+  namespace: veles
+  labels:
+    app: veles-frontend
+spec:
+  selector:
+    app: veles-frontend
+  ports:
+    - name: http
+      protocol: TCP
+      port: 80
+      targetPort: 8080
--- a/services/veles/vault-sync-deployment.yaml
+++ b/services/veles/vault-sync-deployment.yaml
@ -0,0 +1,48 @@
+# services/veles/vault-sync-deployment.yaml
+# Idle pod that keeps the veles-vault SecretProviderClass mounted. The Secrets
+# Store CSI driver only maintains the synced harbor-regcred Secret while at
+# least one pod mounts the class.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: veles-vault-sync
+  namespace: veles
+  labels:
+    app: veles-vault-sync
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: veles-vault-sync
+  template:
+    metadata:
+      labels:
+        app: veles-vault-sync
+    spec:
+      serviceAccountName: veles-vault-sync
+      containers:
+        - name: sync
+          image: alpine:3.20
+          command: ["/bin/sh", "-c"]
+          args:
+            - "sleep infinity"
+          resources:
+            requests:
+              cpu: 10m
+              # Kept at the 32Mi per-container minimum from the veles-container-limits
+              # LimitRange; a smaller request is rejected at admission.
+              memory: 32Mi
+            limits:
+              cpu: 50m
+              memory: 64Mi
+          volumeMounts:
+            - name: vault-secrets
+              mountPath: /vault/secrets
+              readOnly: true
+      volumes:
+        - name: vault-secrets
+          csi:
+            driver: secrets-store.csi.k8s.io
+            readOnly: true
+            volumeAttributes:
+              secretProviderClass: veles-vault