diff --git a/clusters/atlas/flux-system/applications/kustomization.yaml b/clusters/atlas/flux-system/applications/kustomization.yaml index 32229f61..4b87835a 100644 --- a/clusters/atlas/flux-system/applications/kustomization.yaml +++ b/clusters/atlas/flux-system/applications/kustomization.yaml @@ -28,6 +28,7 @@ resources: - ai-llm/kustomization.yaml - openclaw/kustomization.yaml - game-stream/kustomization.yaml + - veles/kustomization.yaml - typhon/kustomization.yaml - nextcloud/kustomization.yaml - nextcloud-mail-sync/kustomization.yaml diff --git a/clusters/atlas/flux-system/applications/veles/image-automation.yaml b/clusters/atlas/flux-system/applications/veles/image-automation.yaml new file mode 100644 index 00000000..7917584b --- /dev/null +++ b/clusters/atlas/flux-system/applications/veles/image-automation.yaml @@ -0,0 +1,29 @@ +# clusters/atlas/flux-system/applications/veles/image-automation.yaml +# Staged for the first Veles image rollout. Add this file to the parent +# applications kustomization after the namespace exists and the Harbor repos +# have initial tags. 
+apiVersion: image.toolkit.fluxcd.io/v1 +kind: ImageUpdateAutomation +metadata: + name: veles + namespace: veles +spec: + interval: 1m0s + sourceRef: + kind: GitRepository + name: flux-system + namespace: flux-system + git: + checkout: + ref: + branch: main + commit: + author: + email: ops@bstein.dev + name: flux-bot + messageTemplate: "chore(veles): automated image update" + push: + branch: main + update: + strategy: Setters + path: services/veles diff --git a/clusters/atlas/flux-system/applications/veles/kustomization.yaml b/clusters/atlas/flux-system/applications/veles/kustomization.yaml new file mode 100644 index 00000000..fdb09cc1 --- /dev/null +++ b/clusters/atlas/flux-system/applications/veles/kustomization.yaml @@ -0,0 +1,28 @@ +# clusters/atlas/flux-system/applications/veles/kustomization.yaml +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: veles + namespace: flux-system + annotations: + kustomize.toolkit.fluxcd.io/ssa: IfNotPresent +spec: + interval: 10m + path: ./services/veles + targetNamespace: veles + prune: true + sourceRef: + kind: GitRepository + name: flux-system + namespace: flux-system + dependsOn: + - name: cert-manager + - name: core + - name: keycloak + - name: longhorn + - name: traefik + - name: vault + - name: vault-csi + - name: vault-injector + wait: false + timeout: 20m diff --git a/infrastructure/core/node-prefer-noschedule-cronjob.yaml b/infrastructure/core/node-prefer-noschedule-cronjob.yaml index 2cad93d5..405c3bbb 100644 --- a/infrastructure/core/node-prefer-noschedule-cronjob.yaml +++ b/infrastructure/core/node-prefer-noschedule-cronjob.yaml @@ -55,6 +55,20 @@ spec: k label node titan-22 atlas.bstein.dev/general-compute=last-resort --overwrite=true || true fi + if k get node titan-23 >/dev/null 2>&1; then + k label node titan-23 \ + veles.bstein.dev/simulation=true \ + veles.bstein.dev/node-pool=oceanus \ + node-role.kubernetes.io/veles-sim=true \ + longhorn-host=true \ + hardware=oceanus \ + 
--overwrite=true || true + k label node titan-23 node-role.kubernetes.io/worker- || true + k taint node titan-23 veles.bstein.dev/simulation=true:NoSchedule --overwrite=true || true + else + echo "skipping missing node titan-23" + fi + for node in titan-13 titan-15 titan-17 titan-19; do if k get node "${node}" >/dev/null 2>&1; then k label node "${node}" atlas.bstein.dev/spillover=true longhorn-host=true --overwrite=true || true diff --git a/infrastructure/longhorn/core/helmrelease.yaml b/infrastructure/longhorn/core/helmrelease.yaml index 48d1ca48..624e0da1 100644 --- a/infrastructure/longhorn/core/helmrelease.yaml +++ b/infrastructure/longhorn/core/helmrelease.yaml @@ -81,7 +81,13 @@ spec: tag: v2.16.0 defaultSettings: systemManagedPodsImagePullPolicy: Always + taintToleration: veles.bstein.dev/simulation=true:NoSchedule longhornManager: + tolerations: + - key: veles.bstein.dev/simulation + operator: Equal + value: "true" + effect: NoSchedule nodeSelector: longhorn-host: "true" longhornDriver: diff --git a/infrastructure/longhorn/core/kustomization.yaml b/infrastructure/longhorn/core/kustomization.yaml index 6b0c572e..c60b4090 100644 --- a/infrastructure/longhorn/core/kustomization.yaml +++ b/infrastructure/longhorn/core/kustomization.yaml @@ -7,6 +7,7 @@ resources: - secretproviderclass.yaml - vault-sync-deployment.yaml - helmrelease.yaml + - veles-recurring-jobs.yaml - longhorn-settings-ensure-job.yaml - longhorn-disk-tags-ensure-job.yaml diff --git a/infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml b/infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml index ec0bf098..0df5a6ad 100644 --- a/infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml +++ b/infrastructure/longhorn/core/longhorn-disk-tags-ensure-job.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: longhorn-disk-tags-ensure-1 + name: longhorn-disk-tags-ensure-3 namespace: longhorn-system spec: backoffLimit: 0 diff --git 
a/infrastructure/longhorn/core/scripts/longhorn_disk_tags_ensure.py b/infrastructure/longhorn/core/scripts/longhorn_disk_tags_ensure.py index 48a41a51..65f69461 100644 --- a/infrastructure/longhorn/core/scripts/longhorn_disk_tags_ensure.py +++ b/infrastructure/longhorn/core/scripts/longhorn_disk_tags_ensure.py @@ -17,10 +17,28 @@ import urllib.request LONGHORN_NS = "longhorn-system" LONGHORN_API = "/apis/longhorn.io/v1beta2/namespaces/{namespace}/nodes" -DESIRED_TAGS = { - "/mnt/astreae": "astreae", - "/mnt/asteria": "asteria", +DESIRED_DISK_TAGS = { + "/mnt/astreae": ["astreae"], + "/mnt/asteria": ["asteria"], + "/mnt/veles": ["veles-oceanus", "veles-db", "veles-artifacts"], + "/mnt/veles-db": ["veles-oceanus", "veles-db"], + "/mnt/veles-artifacts": ["veles-oceanus", "veles-artifacts"], } +DESIRED_NODE_TAGS = { + "titan-23": ["veles-oceanus"], +} +DESIRED_NODE_DISKS = { + "titan-23": { + "veles-oceanus": { + "path": "/mnt/veles", + "allowScheduling": True, + "evictionRequested": False, + "storageReserved": 0, + "tags": ["veles-oceanus", "veles-db", "veles-artifacts"], + } + } +} +DISABLE_DEFAULT_DISK_NODES = {"titan-23"} def api_base() -> str: @@ -63,8 +81,30 @@ def list_nodes() -> list[dict]: return data.get("items", []) -def patch_disk_tags(node_name: str, disk_name: str, desired_tag: str) -> None: - body = {"spec": {"disks": {disk_name: {"tags": [desired_tag]}}}} +def merged_tags(current_tags: list[str], desired_tags: list[str]) -> list[str]: + return sorted(dict.fromkeys([*current_tags, *desired_tags])) + + +def patch_node_tags(node_name: str, desired_tags: list[str]) -> None: + body = {"spec": {"tags": desired_tags}} + request_json( + "PATCH", + f"{LONGHORN_API.format(namespace=LONGHORN_NS)}/{node_name}", + body=body, + ) + + +def patch_disk_tags(node_name: str, disk_name: str, desired_tags: list[str]) -> None: + body = {"spec": {"disks": {disk_name: {"tags": desired_tags}}}} + request_json( + "PATCH", + 
f"{LONGHORN_API.format(namespace=LONGHORN_NS)}/{node_name}", + body=body, + ) + + +def patch_disks(node_name: str, disks: dict) -> None: + body = {"spec": {"disks": disks}} request_json( "PATCH", f"{LONGHORN_API.format(namespace=LONGHORN_NS)}/{node_name}", @@ -78,18 +118,52 @@ def main() -> int: for node in list_nodes(): name = node.get("metadata", {}).get("name", "") + desired_node_tags = DESIRED_NODE_TAGS.get(name) + if desired_node_tags: + current_node_tags = node.get("spec", {}).get("tags") or [] + next_node_tags = merged_tags(current_node_tags, desired_node_tags) + if current_node_tags != next_node_tags: + print(f"patching {name} node tags={current_node_tags!r} -> {next_node_tags!r}") + patch_node_tags(name, next_node_tags) + changed += 1 + else: + skipped += 1 + spec_disks = node.get("spec", {}).get("disks", {}) or {} + desired_disks = DESIRED_NODE_DISKS.get(name, {}) + missing_disks = { + disk_name: disk_spec + for disk_name, disk_spec in desired_disks.items() + if disk_name not in spec_disks + } + if missing_disks: + print(f"adding {name} disks={sorted(missing_disks)}") + patch_disks(name, missing_disks) + changed += len(missing_disks) + spec_disks = {**spec_disks, **missing_disks} + + if name in DISABLE_DEFAULT_DISK_NODES: + disable_patch = {} + for disk_name, disk in spec_disks.items(): + disk_path = (disk.get("path") or "").rstrip("/") + if disk_path == "/var/lib/longhorn" and disk.get("allowScheduling", True): + disable_patch[disk_name] = {"allowScheduling": False} + if disable_patch: + print(f"disabling default Longhorn scheduling on {name} disks={sorted(disable_patch)}") + patch_disks(name, disable_patch) + changed += len(disable_patch) + for disk_name, disk in spec_disks.items(): disk_path = disk.get("path") - desired_tag = DESIRED_TAGS.get(disk_path) - if not desired_tag: + desired_disk_tags = DESIRED_DISK_TAGS.get(disk_path) + if not desired_disk_tags: continue current_tags = disk.get("tags") or [] - if current_tags == [desired_tag]: + if 
sorted(current_tags) == sorted(desired_disk_tags): skipped += 1 continue - print(f"patching {name}:{disk_name} path={disk_path} tags={current_tags!r} -> {[desired_tag]!r}") - patch_disk_tags(name, disk_name, desired_tag) + print(f"patching {name}:{disk_name} path={disk_path} tags={current_tags!r} -> {desired_disk_tags!r}") + patch_disk_tags(name, disk_name, desired_disk_tags) changed += 1 print(f"done: changed={changed} skipped={skipped}") diff --git a/infrastructure/longhorn/core/veles-recurring-jobs.yaml b/infrastructure/longhorn/core/veles-recurring-jobs.yaml new file mode 100644 index 00000000..ca4f08eb --- /dev/null +++ b/infrastructure/longhorn/core/veles-recurring-jobs.yaml @@ -0,0 +1,28 @@ +# infrastructure/longhorn/core/veles-recurring-jobs.yaml +apiVersion: longhorn.io/v1beta2 +kind: RecurringJob +metadata: + name: veles-postgres-backup + namespace: longhorn-system +spec: + cron: "30 5 * * *" + task: backup + groups: + - veles + - veles-postgres + retain: 7 + concurrency: 1 +--- +apiVersion: longhorn.io/v1beta2 +kind: RecurringJob +metadata: + name: veles-postgres-snapshot + namespace: longhorn-system +spec: + cron: "*/30 * * * *" + task: snapshot + groups: + - veles + - veles-postgres + retain: 8 + concurrency: 1 diff --git a/infrastructure/modules/base/priorityclass/kustomization.yaml b/infrastructure/modules/base/priorityclass/kustomization.yaml index 4524ab96..3d486b4e 100644 --- a/infrastructure/modules/base/priorityclass/kustomization.yaml +++ b/infrastructure/modules/base/priorityclass/kustomization.yaml @@ -3,3 +3,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - scavenger.yaml + - veles.yaml diff --git a/infrastructure/modules/base/priorityclass/veles.yaml b/infrastructure/modules/base/priorityclass/veles.yaml new file mode 100644 index 00000000..0b08f37f --- /dev/null +++ b/infrastructure/modules/base/priorityclass/veles.yaml @@ -0,0 +1,17 @@ +# infrastructure/modules/base/priorityclass/veles.yaml +apiVersion: 
scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: veles-core +value: 500 +globalDefault: false +description: "For Veles core database, API, and controller workloads" +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: veles-sim +value: 50 +globalDefault: false +preemptionPolicy: Never +description: "For Veles simulation jobs; lower than core and non-preempting" diff --git a/infrastructure/modules/base/storageclass/kustomization.yaml b/infrastructure/modules/base/storageclass/kustomization.yaml index 44d79c70..a4c111a1 100644 --- a/infrastructure/modules/base/storageclass/kustomization.yaml +++ b/infrastructure/modules/base/storageclass/kustomization.yaml @@ -5,3 +5,6 @@ resources: - asteria.yaml - asteria-encrypted.yaml - astreae.yaml + - veles-oceanus-db.yaml + - veles-oceanus-artifacts.yaml + - veles-oceanus-policy.yaml diff --git a/infrastructure/modules/base/storageclass/veles-oceanus-artifacts.yaml b/infrastructure/modules/base/storageclass/veles-oceanus-artifacts.yaml new file mode 100644 index 00000000..48501743 --- /dev/null +++ b/infrastructure/modules/base/storageclass/veles-oceanus-artifacts.yaml @@ -0,0 +1,20 @@ +# infrastructure/modules/base/storageclass/veles-oceanus-artifacts.yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: veles-oceanus-artifacts + annotations: + veles.bstein.dev/allowed-namespace: veles +provisioner: driver.longhorn.io +parameters: + nodeSelector: veles-oceanus + diskSelector: veles-oceanus,veles-artifacts + fromBackup: "" + numberOfReplicas: "1" + staleReplicaTimeout: "30" + fsType: ext4 + replicaAutoBalance: disabled + dataLocality: strict-local +reclaimPolicy: Retain +allowVolumeExpansion: true +volumeBindingMode: WaitForFirstConsumer diff --git a/infrastructure/modules/base/storageclass/veles-oceanus-db.yaml b/infrastructure/modules/base/storageclass/veles-oceanus-db.yaml new file mode 100644 index 00000000..a9e1386e --- /dev/null +++ 
b/infrastructure/modules/base/storageclass/veles-oceanus-db.yaml @@ -0,0 +1,21 @@ +# infrastructure/modules/base/storageclass/veles-oceanus-db.yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: veles-oceanus-db + annotations: + veles.bstein.dev/allowed-namespace: veles +provisioner: driver.longhorn.io +parameters: + nodeSelector: veles-oceanus + diskSelector: veles-oceanus,veles-db + fromBackup: "" + numberOfReplicas: "1" + staleReplicaTimeout: "30" + fsType: ext4 + replicaAutoBalance: disabled + dataLocality: strict-local + recurringJobSelector: '[{"name":"veles-postgres-backup","isGroup":false},{"name":"veles-postgres-snapshot","isGroup":false}]' +reclaimPolicy: Retain +allowVolumeExpansion: true +volumeBindingMode: WaitForFirstConsumer diff --git a/infrastructure/modules/base/storageclass/veles-oceanus-policy.yaml b/infrastructure/modules/base/storageclass/veles-oceanus-policy.yaml new file mode 100644 index 00000000..8f616829 --- /dev/null +++ b/infrastructure/modules/base/storageclass/veles-oceanus-policy.yaml @@ -0,0 +1,25 @@ +# infrastructure/modules/base/storageclass/veles-oceanus-policy.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingAdmissionPolicy +metadata: + name: veles-oceanus-storage-namespace +spec: + failurePolicy: Fail + matchConstraints: + resourceRules: + - apiGroups: [""] + apiVersions: ["v1"] + operations: ["CREATE", "UPDATE"] + resources: ["persistentvolumeclaims"] + validations: + - expression: "!has(object.spec.storageClassName) || !(object.spec.storageClassName in ['veles-oceanus-db', 'veles-oceanus-artifacts']) || object.metadata.namespace == 'veles'" + message: "Veles Oceanus storage classes are reserved for namespace veles" +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingAdmissionPolicyBinding +metadata: + name: veles-oceanus-storage-namespace +spec: + policyName: veles-oceanus-storage-namespace + validationActions: + - Deny diff --git a/services/jenkins/configmap-jcasc.yaml 
b/services/jenkins/configmap-jcasc.yaml index 4b59de10..5598a420 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -429,6 +429,24 @@ data: } } } + pipelineJob('veles') { + disabled(true) + description('Staged Veles alpha image pipeline. Backend/frontend should build linux/amd64 and linux/arm64; sim-worker may begin amd64-only if Forge dependencies require it.') + definition { + cpsScm { + scm { + git { + remote { + url('https://scm.bstein.dev/bstein/veles.git') + credentials('gitea-pat') + } + branches('*/main') + } + } + scriptPath('Jenkinsfile') + } + } + } multibranchPipelineJob('titan-iac-quality-gate') { branchSources { branchSource { diff --git a/services/keycloak/kustomization.yaml b/services/keycloak/kustomization.yaml index d9213575..f635d2f8 100644 --- a/services/keycloak/kustomization.yaml +++ b/services/keycloak/kustomization.yaml @@ -27,6 +27,7 @@ resources: - oneoffs/soteria-oidc-secret-ensure-job.yaml - oneoffs/quality-oidc-secret-ensure-job.yaml - oneoffs/agent-oidc-secret-ensure-job.yaml + - oneoffs/veles-realm-ensure-job.yaml - oneoffs/metis-ssh-keys-secret-ensure-job.yaml - oneoffs/metis-node-passwords-secret-ensure-job.yaml - oneoffs/harbor-oidc-secret-ensure-job.yaml diff --git a/services/keycloak/oneoffs/veles-realm-ensure-job.yaml b/services/keycloak/oneoffs/veles-realm-ensure-job.yaml new file mode 100644 index 00000000..5309d315 --- /dev/null +++ b/services/keycloak/oneoffs/veles-realm-ensure-job.yaml @@ -0,0 +1,332 @@ +# services/keycloak/oneoffs/veles-realm-ensure-job.yaml +# One-off job for sso/veles-realm-ensure-1. +# Purpose: create the Veles realm, groups, OIDC client, SMTP settings, and Vault client secret. +# Keep suspended until Veles Vault paths/policies have reconciled, then unsuspend once. 
+apiVersion: batch/v1 +kind: Job +metadata: + name: veles-realm-ensure-1 + namespace: sso +spec: + suspend: true + backoffLimit: 0 + ttlSecondsAfterFinished: 3600 + template: + metadata: + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/agent-pre-populate-only: "true" + vault.hashicorp.com/role: "sso-secrets" + vault.hashicorp.com/agent-inject-secret-keycloak-admin-env.sh: "kv/data/atlas/shared/keycloak-admin" + vault.hashicorp.com/agent-inject-template-keycloak-admin-env.sh: | + {{ with secret "kv/data/atlas/shared/keycloak-admin" }} + export KEYCLOAK_ADMIN="{{ .Data.data.username }}" + export KEYCLOAK_ADMIN_USER="{{ .Data.data.username }}" + export KEYCLOAK_ADMIN_PASSWORD="{{ .Data.data.password }}" + {{ end }} + {{ with secret "kv/data/atlas/shared/postmark-relay" }} + export KEYCLOAK_SMTP_USER="{{ index .Data.data "apikey" }}" + export KEYCLOAK_SMTP_PASSWORD="{{ index .Data.data "apikey" }}" + {{ end }} + spec: + serviceAccountName: mas-secrets-ensure + restartPolicy: Never + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: kubernetes.io/arch + operator: In + values: ["arm64"] + containers: + - name: configure + image: python:3.11-alpine + env: + - name: KEYCLOAK_SERVER + value: http://keycloak.sso.svc.cluster.local + - name: KEYCLOAK_REALM + value: veles + - name: KEYCLOAK_CLIENT_ID + value: veles-web + - name: KEYCLOAK_PUBLIC_ISSUER + value: https://sso.bstein.dev/realms/veles + - name: VELES_BASE_URL + value: https://veles.bstein.dev + - name: KEYCLOAK_SMTP_HOST + value: mail.bstein.dev + - name: KEYCLOAK_SMTP_PORT + value: "587" + - name: KEYCLOAK_SMTP_FROM + value: no-reply-veles@bstein.dev + - name: KEYCLOAK_SMTP_FROM_NAME + value: Veles + command: ["/bin/sh", "-c"] + args: + - | + set -eu + 
. /vault/secrets/keycloak-admin-env.sh + python - <<'PY' + import json + import os + import time + import urllib.error + import urllib.parse + import urllib.request + + base_url = os.environ["KEYCLOAK_SERVER"].rstrip("/") + realm = os.environ["KEYCLOAK_REALM"] + client_id = os.environ["KEYCLOAK_CLIENT_ID"] + issuer = os.environ["KEYCLOAK_PUBLIC_ISSUER"] + veles_base_url = os.environ["VELES_BASE_URL"].rstrip("/") + admin_user = os.environ["KEYCLOAK_ADMIN_USER"] + admin_password = os.environ["KEYCLOAK_ADMIN_PASSWORD"] + + def request(method, url, token=None, payload=None, headers=None, timeout=30): + data = None + req_headers = headers.copy() if headers else {} + if token: + req_headers["Authorization"] = f"Bearer {token}" + if payload is not None: + data = json.dumps(payload).encode() + req_headers["Content-Type"] = "application/json" + req = urllib.request.Request(url, data=data, headers=req_headers, method=method) + try: + with urllib.request.urlopen(req, timeout=timeout) as resp: + body = resp.read() + if not body: + return resp.status, None + return resp.status, json.loads(body.decode()) + except urllib.error.HTTPError as exc: + raw = exc.read() + if not raw: + return exc.code, None + try: + return exc.code, json.loads(raw.decode()) + except Exception: + return exc.code, {"raw": raw.decode(errors="replace")} + + token_body = None + form = urllib.parse.urlencode( + { + "grant_type": "password", + "client_id": "admin-cli", + "username": admin_user, + "password": admin_password, + } + ).encode() + for attempt in range(1, 11): + req = urllib.request.Request( + f"{base_url}/realms/master/protocol/openid-connect/token", + data=form, + headers={"Content-Type": "application/x-www-form-urlencoded"}, + method="POST", + ) + try: + with urllib.request.urlopen(req, timeout=10) as resp: + token_body = json.loads(resp.read().decode()) + break + except urllib.error.URLError as exc: + if attempt == 10: + raise SystemExit(f"Keycloak token request failed after retries: {exc}") + 
time.sleep(attempt * 2) + token = token_body["access_token"] + + smtp = { + "host": os.environ["KEYCLOAK_SMTP_HOST"], + "port": os.environ["KEYCLOAK_SMTP_PORT"], + "from": os.environ["KEYCLOAK_SMTP_FROM"], + "fromDisplayName": os.environ["KEYCLOAK_SMTP_FROM_NAME"], + "replyTo": os.environ["KEYCLOAK_SMTP_FROM"], + "replyToDisplayName": os.environ["KEYCLOAK_SMTP_FROM_NAME"], + "user": os.environ["KEYCLOAK_SMTP_USER"], + "password": os.environ["KEYCLOAK_SMTP_PASSWORD"], + "auth": "true", + "starttls": "true", + "ssl": "false", + } + + status, realm_rep = request("GET", f"{base_url}/admin/realms/{realm}", token) + if status == 404: + create_payload = { + "realm": realm, + "enabled": True, + "registrationAllowed": False, + "resetPasswordAllowed": True, + "verifyEmail": True, + "loginWithEmailAllowed": True, + "duplicateEmailsAllowed": False, + "smtpServer": smtp, + } + status, body = request("POST", f"{base_url}/admin/realms", token, create_payload) + if status not in (201, 204, 409): + raise SystemExit(f"Realm create failed: status={status} body={body}") + status, realm_rep = request("GET", f"{base_url}/admin/realms/{realm}", token) + if status != 200 or not isinstance(realm_rep, dict): + raise SystemExit(f"Realm fetch failed: status={status}") + + realm_rep.update( + { + "enabled": True, + "registrationAllowed": False, + "resetPasswordAllowed": True, + "verifyEmail": True, + "loginWithEmailAllowed": True, + "duplicateEmailsAllowed": False, + "smtpServer": smtp, + } + ) + status, body = request("PUT", f"{base_url}/admin/realms/{realm}", token, realm_rep) + if status not in (200, 204): + raise SystemExit(f"Realm update failed: status={status} body={body}") + + def ensure_group(name): + status, groups = request( + "GET", + f"{base_url}/admin/realms/{realm}/groups?search={urllib.parse.quote(name)}", + token, + ) + if status != 200: + raise SystemExit(f"Group search failed for {name}: status={status}") + if any(group.get("name") == name for group in groups or []): + return 
+ status, body = request("POST", f"{base_url}/admin/realms/{realm}/groups", token, {"name": name}) + if status not in (201, 204, 409): + raise SystemExit(f"Group create failed for {name}: status={status} body={body}") + + ensure_group("alpha") + ensure_group("admin") + + status, clients = request( + "GET", + f"{base_url}/admin/realms/{realm}/clients?clientId={urllib.parse.quote(client_id)}", + token, + ) + if status != 200: + raise SystemExit(f"Client lookup failed: status={status}") + client_uuid = clients[0]["id"] if clients else None + client_payload = { + "clientId": client_id, + "enabled": True, + "protocol": "openid-connect", + "publicClient": False, + "standardFlowEnabled": True, + "implicitFlowEnabled": False, + "directAccessGrantsEnabled": False, + "serviceAccountsEnabled": False, + "redirectUris": [f"{veles_base_url}/*"], + "webOrigins": [veles_base_url], + "rootUrl": veles_base_url, + "baseUrl": "/", + "attributes": { + "pkce.code.challenge.method": "S256", + "post.logout.redirect.uris": f"{veles_base_url}/*", + }, + } + if not client_uuid: + status, body = request("POST", f"{base_url}/admin/realms/{realm}/clients", token, client_payload) + if status not in (201, 204, 409): + raise SystemExit(f"Client create failed: status={status} body={body}") + status, clients = request( + "GET", + f"{base_url}/admin/realms/{realm}/clients?clientId={urllib.parse.quote(client_id)}", + token, + ) + client_uuid = clients[0]["id"] if clients else None + if not client_uuid: + raise SystemExit("Client veles-web not found after create") + status, body = request( + "PUT", + f"{base_url}/admin/realms/{realm}/clients/{client_uuid}", + token, + client_payload, + ) + if status not in (200, 204): + raise SystemExit(f"Client update failed: status={status} body={body}") + + mapper_payload = { + "name": "groups", + "protocol": "openid-connect", + "protocolMapper": "oidc-group-membership-mapper", + "consentRequired": False, + "config": { + "full.path": "false", + "id.token.claim": 
"true", + "access.token.claim": "true", + "userinfo.token.claim": "true", + "claim.name": "groups", + }, + } + status, mappers = request( + "GET", + f"{base_url}/admin/realms/{realm}/clients/{client_uuid}/protocol-mappers/models", + token, + ) + if status != 200: + raise SystemExit(f"Mapper lookup failed: status={status}") + mapper_id = next((mapper.get("id") for mapper in mappers or [] if mapper.get("name") == "groups"), None) + if mapper_id: + status, body = request( + "PUT", + f"{base_url}/admin/realms/{realm}/clients/{client_uuid}/protocol-mappers/models/{mapper_id}", + token, + mapper_payload, + ) + else: + status, body = request( + "POST", + f"{base_url}/admin/realms/{realm}/clients/{client_uuid}/protocol-mappers/models", + token, + mapper_payload, + ) + if status not in (200, 201, 204): + raise SystemExit(f"Mapper ensure failed: status={status} body={body}") + + status, secret = request( + "GET", + f"{base_url}/admin/realms/{realm}/clients/{client_uuid}/client-secret", + token, + ) + client_secret = (secret or {}).get("value") + if status != 200 or not client_secret: + raise SystemExit(f"Client secret fetch failed: status={status}") + + vault_addr = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200") + jwt = open("/var/run/secrets/kubernetes.io/serviceaccount/token", encoding="utf-8").read().strip() + login_payload = json.dumps({"jwt": jwt, "role": os.environ.get("VAULT_ROLE", "sso-secrets")}).encode() + req = urllib.request.Request( + f"{vault_addr}/v1/auth/kubernetes/login", + data=login_payload, + headers={"Content-Type": "application/json"}, + method="POST", + ) + with urllib.request.urlopen(req, timeout=20) as resp: + vault_token = json.loads(resp.read().decode())["auth"]["client_token"] + + payload = { + "data": { + "client_id": client_id, + "client_secret": client_secret, + "issuer": issuer, + "realm": realm, + "required_groups": "alpha,admin", + } + } + req = urllib.request.Request( + 
f"{vault_addr}/v1/kv/data/atlas/veles/veles-oidc", + data=json.dumps(payload).encode(), + headers={"X-Vault-Token": vault_token, "Content-Type": "application/json"}, + method="POST", + ) + with urllib.request.urlopen(req, timeout=20) as resp: + if resp.status not in (200, 204): + raise SystemExit(f"Vault write returned {resp.status}") + + print("Veles Keycloak realm/client ready") + PY diff --git a/services/maintenance/metis-configmap.yaml b/services/maintenance/metis-configmap.yaml index f1974ca2..b01f298e 100644 --- a/services/maintenance/metis-configmap.yaml +++ b/services/maintenance/metis-configmap.yaml @@ -9,7 +9,7 @@ data: METIS_INVENTORY_PATH: /app/inventory.titan-rpi4.yaml METIS_DATA_DIR: /var/lib/metis METIS_DEFAULT_FLASH_HOST: titan-20 - METIS_FLASH_HOSTS: titan-20,titan-21,titan-22,titan-24,titan-19,titan-17,titan-15,titan-14,titan-12,titan-11,titan-10,titan-09,titan-08,titan-07,titan-06,titan-05,titan-04,titan-0c,titan-0b,titan-0a + METIS_FLASH_HOSTS: titan-20,titan-21,titan-22,titan-23,titan-24,titan-19,titan-17,titan-15,titan-14,titan-12,titan-11,titan-10,titan-09,titan-08,titan-07,titan-06,titan-05,titan-04,titan-0c,titan-0b,titan-0a METIS_LOCAL_HOST: titan-20 METIS_ALLOWED_GROUPS: admin,maintenance METIS_MAX_DEVICE_BYTES: "1000000000000" diff --git a/services/monitoring/dcgm-exporter.yaml b/services/monitoring/dcgm-exporter.yaml index ff5aed5a..e8c3b03d 100644 --- a/services/monitoring/dcgm-exporter.yaml +++ b/services/monitoring/dcgm-exporter.yaml @@ -38,6 +38,12 @@ spec: operator: NotIn values: - "true" + - key: veles.bstein.dev/node-pool + operator: NotIn + values: + - oceanus + - key: node-role.kubernetes.io/accelerator + operator: Exists tolerations: - operator: Exists containers: diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index c25e6dd4..e3ffdbc5 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -50,6 +50,15 @@ spec: upgrade: disableWait: true values: 
+ affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: veles.bstein.dev/node-pool + operator: NotIn + values: + - oceanus rbac: pspEnabled: false service: diff --git a/services/vault/scripts/vault_k8s_auth_configure.sh b/services/vault/scripts/vault_k8s_auth_configure.sh index adb72748..2f9ae177 100644 --- a/services/vault/scripts/vault_k8s_auth_configure.sh +++ b/services/vault/scripts/vault_k8s_auth_configure.sh @@ -240,6 +240,11 @@ write_policy_and_role "game-stream" "game-stream" "game-stream-vault" \ "game-stream/*" "" write_policy_and_role "openclaw" "openclaw" "agent-vault" \ "openclaw/*" "" +write_policy_and_role "veles" "veles" "veles-backend,veles-postgres,veles-vault-sync" \ + "veles/* shared/harbor-pull shared/postmark-relay" "" +write_policy_and_role "veles-secrets" "veles" "veles-secrets-ensure" \ + "shared/postmark-relay" \ + "veles/*" write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync,metis" \ "maintenance/ariadne-db maintenance/metis-oidc maintenance/soteria-oidc maintenance/metis-ssh-keys maintenance/metis-runtime portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull shared/soteria-restic harbor/harbor-core" "" \ ' @@ -266,8 +271,8 @@ write_policy_and_role "vault" "vault" "vault" \ "vault/*" "" write_policy_and_role "sso-secrets" "sso" "mas-secrets-ensure" \ - "shared/keycloak-admin maintenance/metis-ssh-keys" \ - "harbor/harbor-oidc vault/vault-oidc-config comms/synapse-oidc logging/oauth2-proxy-logs-oidc finance/actual-oidc maintenance/metis-oidc maintenance/soteria-oidc maintenance/metis-ssh-keys openclaw/agent-oidc" \ + "shared/keycloak-admin 
shared/postmark-relay maintenance/metis-ssh-keys" \ + "harbor/harbor-oidc vault/vault-oidc-config comms/synapse-oidc logging/oauth2-proxy-logs-oidc finance/actual-oidc maintenance/metis-oidc maintenance/soteria-oidc maintenance/metis-ssh-keys openclaw/agent-oidc veles/veles-oidc" \ ' path "kv/data/atlas/nodes/*" { capabilities = ["create", "update", "read"] diff --git a/services/veles/NOTES.md b/services/veles/NOTES.md new file mode 100644 index 00000000..1b8c588a --- /dev/null +++ b/services/veles/NOTES.md @@ -0,0 +1,64 @@ +# Veles Infrastructure Contract + +This stack is staged for Flux and intentionally starts the app deployments at `replicas: 0` until images and the app-side runtime contract are ready. + +## Cluster Contract + +- Namespace: `veles` +- Hostname: `https://veles.bstein.dev` +- Alternate namespaces: none; no separate alpha namespace is used. +- Backend service: `veles-backend.veles.svc.cluster.local:80` +- Frontend service: `veles-frontend.veles.svc.cluster.local:80` +- Postgres service: `veles-postgres.veles.svc.cluster.local:5432` +- Artifact PVC: `veles-artifacts`, mounted at `/data/veles-artifacts` +- Storage classes: `veles-oceanus-db`, `veles-oceanus-artifacts` +- Images: + - `registry.bstein.dev/veles/veles-backend` + - `registry.bstein.dev/veles/veles-frontend` + - `registry.bstein.dev/veles/veles-sim-worker` + +## Runtime Env + +Veles should consume: + +- `VELES_PUBLIC_BASE_URL=https://veles.bstein.dev` +- `VELES_OIDC_ISSUER=https://sso.bstein.dev/realms/veles` +- `VELES_OIDC_CLIENT_ID=veles-web` +- `VELES_OIDC_REQUIRED_GROUPS=alpha,admin` +- `DATABASE_URL` from `kv/data/atlas/veles/veles-db` +- `VELES_SESSION_SECRET` from `kv/data/atlas/veles/app-secrets` +- `VELES_BYOK_ENCRYPTION_KEY` from `kv/data/atlas/veles/app-secrets` + +User OpenAI API keys must stay in the Veles database encrypted with `VELES_BYOK_ENCRYPTION_KEY`; do not store per-user BYOK secrets in Vault. 
+ +## Simulation Jobs + +The backend service account can create, watch, and delete Jobs only inside the `veles` namespace. Simulation pods should use service account `veles-sim`, set `automountServiceAccountToken: false`, and use: + +```yaml +priorityClassName: veles-sim +nodeSelector: + veles.bstein.dev/simulation: "true" +tolerations: + - key: veles.bstein.dev/simulation + operator: Equal + value: "true" + effect: NoSchedule +``` + +## Staged Operator Steps + +1. Join `titan-23`/Oceanus to Atlas as a worker. +2. Use Metis with `titan-23` in `METIS_FLASH_HOSTS`; the existing node secret placeholder uses `192.168.22.23`. +3. Confirm the node normalizer applies the Veles labels and taint. +4. Add Oceanus Longhorn disks at paths tagged by the Longhorn tag ensure job. +5. Let Vault policy reconciliation run, then unsuspend `veles-secrets-ensure-1`. +6. Unsuspend `veles-realm-ensure-1` in `services/keycloak` to create the realm/client secret. +7. Create the Harbor `veles` project or robot access before image automation is enabled in production. +8. Scale `veles-postgres`, then backend/frontend once app images exist. + +## Assumptions + +- `veles-oceanus-artifacts` is RWO for alpha; simulation workers should either run on Oceanus with the backend or stream logs to the backend, which owns writes. +- Postgres uses Longhorn backup recurring jobs off Oceanus. This is not a substitute for a tested restore drill. +- The Jenkins job skeleton points at the Veles repo but stays disabled until that repo provides a Jenkinsfile. 
diff --git a/services/veles/artifacts-pvc.yaml b/services/veles/artifacts-pvc.yaml new file mode 100644 index 00000000..54904f13 --- /dev/null +++ b/services/veles/artifacts-pvc.yaml @@ -0,0 +1,16 @@ +# services/veles/artifacts-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: veles-artifacts + namespace: veles + labels: + app.kubernetes.io/name: veles + app.kubernetes.io/component: artifacts +spec: + accessModes: + - ReadWriteOnce + storageClassName: veles-oceanus-artifacts + resources: + requests: + storage: 200Gi diff --git a/services/veles/backend-deployment.yaml b/services/veles/backend-deployment.yaml new file mode 100644 index 00000000..47244ebf --- /dev/null +++ b/services/veles/backend-deployment.yaml @@ -0,0 +1,92 @@ +# services/veles/backend-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: veles-backend + namespace: veles + labels: + app: veles-backend +spec: + replicas: 0 + # Recreate instead of a surge rollout: a second backend pod would share the ReadWriteOnce veles-artifacts claim and, with veles-postgres (16Gi) and veles-frontend (512Mi) running, its 4Gi limit exceeds the 24Gi veles-core-quota limits.memory. + strategy: + type: Recreate + revisionHistoryLimit: 2 + selector: + matchLabels: + app: veles-backend + template: + metadata: + labels: + app: veles-backend + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/agent-pre-populate-only: "true" + vault.hashicorp.com/role: "veles" + vault.hashicorp.com/agent-inject-secret-veles-env.sh: "kv/data/atlas/veles/veles-db" + vault.hashicorp.com/agent-inject-template-veles-env.sh: | + {{- with secret "kv/data/atlas/veles/veles-db" }} + export DATABASE_URL="{{ .Data.data.DATABASE_URL }}" + export VELES_DATABASE_USER="{{ .Data.data.POSTGRES_USER }}" + export VELES_DATABASE_PASSWORD="{{ .Data.data.POSTGRES_PASSWORD }}" + {{- end }} + {{- with secret "kv/data/atlas/veles/veles-oidc" }} + export VELES_OIDC_CLIENT_SECRET="{{ .Data.data.client_secret }}" + {{- end }} + {{- with secret "kv/data/atlas/veles/app-secrets" }} + export VELES_SESSION_SECRET="{{ .Data.data.VELES_SESSION_SECRET }}" + export VELES_BYOK_ENCRYPTION_KEY="{{ .Data.data.VELES_BYOK_ENCRYPTION_KEY }}" + {{- end }} + spec: +
serviceAccountName: veles-backend + priorityClassName: veles-core + nodeSelector: + veles.bstein.dev/node-pool: oceanus + tolerations: + - key: veles.bstein.dev/simulation + operator: Equal + value: "true" + effect: NoSchedule + securityContext: + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + containers: + - name: backend + image: registry.bstein.dev/veles/veles-backend:0.1.0-0 # {"$imagepolicy": "veles:veles-backend"} + imagePullPolicy: IfNotPresent + command: ["/bin/sh", "-c"] + args: + - | + if [ -f /vault/secrets/veles-env.sh ]; then + . /vault/secrets/veles-env.sh + fi + exec /app/veles-backend + ports: + - name: http + containerPort: 8080 + protocol: TCP + envFrom: + - configMapRef: + name: veles-app-config + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: "2" + memory: 4Gi + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: artifacts + mountPath: /data/veles-artifacts + volumes: + - name: artifacts + persistentVolumeClaim: + claimName: veles-artifacts diff --git a/services/veles/configmap.yaml b/services/veles/configmap.yaml new file mode 100644 index 00000000..3c34c2b9 --- /dev/null +++ b/services/veles/configmap.yaml @@ -0,0 +1,23 @@ +# services/veles/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: veles-app-config + namespace: veles +data: + VELES_ENV: alpha + VELES_PUBLIC_BASE_URL: https://veles.bstein.dev + VELES_OIDC_ISSUER: https://sso.bstein.dev/realms/veles + VELES_OIDC_CLIENT_ID: veles-web + VELES_OIDC_REQUIRED_GROUPS: alpha,admin + VELES_DATABASE_HOST: veles-postgres.veles.svc.cluster.local + VELES_DATABASE_PORT: "5432" + VELES_DATABASE_NAME: veles + VELES_ARTIFACTS_PATH: /data/veles-artifacts + VELES_SIM_NAMESPACE: veles + VELES_SIM_SERVICE_ACCOUNT: veles-sim + VELES_SIM_PRIORITY_CLASS: veles-sim + VELES_SIM_NODE_SELECTOR: veles.bstein.dev/simulation=true + VELES_SIM_TOLERATION_KEY: 
veles.bstein.dev/simulation + VELES_SIM_TOLERATION_VALUE: "true" + VELES_LOG_RETENTION_DAYS: "30" diff --git a/services/veles/frontend-deployment.yaml b/services/veles/frontend-deployment.yaml new file mode 100644 index 00000000..b4215e4c --- /dev/null +++ b/services/veles/frontend-deployment.yaml @@ -0,0 +1,72 @@ +# services/veles/frontend-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: veles-frontend + namespace: veles + labels: + app: veles-frontend +spec: + replicas: 0 + revisionHistoryLimit: 2 + selector: + matchLabels: + app: veles-frontend + template: + metadata: + labels: + app: veles-frontend + spec: + serviceAccountName: veles-frontend + priorityClassName: veles-core + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + - key: hardware + operator: In + values: ["rpi5", "rpi4", "amd64"] + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: atlas.bstein.dev/spillover + operator: DoesNotExist + - weight: 90 + preference: + matchExpressions: + - key: hardware + operator: In + values: ["rpi5"] + securityContext: + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + containers: + - name: frontend + image: registry.bstein.dev/veles/veles-frontend:0.1.0-0 # {"$imagepolicy": "veles:veles-frontend"} + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 8080 + protocol: TCP + envFrom: + - configMapRef: + name: veles-app-config + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] diff --git a/services/veles/image.yaml b/services/veles/image.yaml new file mode 100644 index 00000000..f038856a --- /dev/null +++ b/services/veles/image.yaml @@ -0,0 +1,69 @@ +# 
services/veles/image.yaml +apiVersion: image.toolkit.fluxcd.io/v1 +kind: ImageRepository +metadata: + name: veles-backend + namespace: veles +spec: + image: registry.bstein.dev/veles/veles-backend + interval: 1m0s + secretRef: + name: harbor-regcred +--- +apiVersion: image.toolkit.fluxcd.io/v1 +kind: ImagePolicy +metadata: + name: veles-backend + namespace: veles +spec: + imageRepositoryRef: + name: veles-backend + policy: + semver: + range: ">=0.1.0-0" +--- +apiVersion: image.toolkit.fluxcd.io/v1 +kind: ImageRepository +metadata: + name: veles-frontend + namespace: veles +spec: + image: registry.bstein.dev/veles/veles-frontend + interval: 1m0s + secretRef: + name: harbor-regcred +--- +apiVersion: image.toolkit.fluxcd.io/v1 +kind: ImagePolicy +metadata: + name: veles-frontend + namespace: veles +spec: + imageRepositoryRef: + name: veles-frontend + policy: + semver: + range: ">=0.1.0-0" +--- +apiVersion: image.toolkit.fluxcd.io/v1 +kind: ImageRepository +metadata: + name: veles-sim-worker + namespace: veles +spec: + image: registry.bstein.dev/veles/veles-sim-worker + interval: 1m0s + secretRef: + name: harbor-regcred +--- +apiVersion: image.toolkit.fluxcd.io/v1 +kind: ImagePolicy +metadata: + name: veles-sim-worker + namespace: veles +spec: + imageRepositoryRef: + name: veles-sim-worker + policy: + semver: + range: ">=0.1.0-0" diff --git a/services/veles/ingress.yaml b/services/veles/ingress.yaml new file mode 100644 index 00000000..ae2597f7 --- /dev/null +++ b/services/veles/ingress.yaml @@ -0,0 +1,47 @@ +# services/veles/ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: veles + namespace: veles + annotations: + cert-manager.io/cluster-issuer: letsencrypt + traefik.ingress.kubernetes.io/router.entrypoints: websecure + traefik.ingress.kubernetes.io/router.tls: "true" +spec: + ingressClassName: traefik + tls: + - hosts: ["veles.bstein.dev"] + secretName: veles-tls + rules: + - host: veles.bstein.dev + http: + paths: + - path: /api + 
pathType: Prefix + backend: + service: + name: veles-backend + port: + number: 80 + - path: /events + pathType: Prefix + backend: + service: + name: veles-backend + port: + number: 80 + - path: /ws + pathType: Prefix + backend: + service: + name: veles-backend + port: + number: 80 + - path: / + pathType: Prefix + backend: + service: + name: veles-frontend + port: + number: 80 diff --git a/services/veles/kustomization.yaml b/services/veles/kustomization.yaml new file mode 100644 index 00000000..68dbabb8 --- /dev/null +++ b/services/veles/kustomization.yaml @@ -0,0 +1,22 @@ +# services/veles/kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: veles +resources: + - namespace.yaml + - serviceaccounts.yaml + - secretproviderclass.yaml + - vault-sync-deployment.yaml + - resourcequota.yaml + - limitrange.yaml + - configmap.yaml + - rbac.yaml + - artifacts-pvc.yaml + - postgres-service.yaml + - postgres-statefulset.yaml + - services.yaml + - backend-deployment.yaml + - frontend-deployment.yaml + - image.yaml + - ingress.yaml + - oneoffs/veles-secrets-ensure-job.yaml diff --git a/services/veles/limitrange.yaml b/services/veles/limitrange.yaml new file mode 100644 index 00000000..f0e07365 --- /dev/null +++ b/services/veles/limitrange.yaml @@ -0,0 +1,21 @@ +# services/veles/limitrange.yaml +apiVersion: v1 +kind: LimitRange +metadata: + name: veles-container-limits + namespace: veles +spec: + limits: + - type: Container + defaultRequest: + cpu: 100m + memory: 256Mi + default: + cpu: 500m + memory: 512Mi + min: + cpu: 10m + memory: 32Mi + max: + cpu: "16" + memory: 32Gi diff --git a/services/veles/namespace.yaml b/services/veles/namespace.yaml new file mode 100644 index 00000000..c7c1859a --- /dev/null +++ b/services/veles/namespace.yaml @@ -0,0 +1,8 @@ +# services/veles/namespace.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: veles + labels: + app.kubernetes.io/name: veles + app.kubernetes.io/part-of: veles diff --git 
a/services/veles/oneoffs/veles-secrets-ensure-job.yaml b/services/veles/oneoffs/veles-secrets-ensure-job.yaml new file mode 100644 index 00000000..ee447403 --- /dev/null +++ b/services/veles/oneoffs/veles-secrets-ensure-job.yaml @@ -0,0 +1,142 @@ +# services/veles/oneoffs/veles-secrets-ensure-job.yaml +# One-off job for veles/veles-secrets-ensure-1. +# Purpose: seed Veles Vault paths before app/Postgres pods are scaled up. +# Keep suspended until the veles Vault role has reconciled, then unsuspend once. +apiVersion: batch/v1 +kind: Job +metadata: + name: veles-secrets-ensure-1 + namespace: veles +spec: + suspend: true + backoffLimit: 0 + ttlSecondsAfterFinished: 3600 + template: + spec: + serviceAccountName: veles-secrets-ensure + restartPolicy: Never + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: kubernetes.io/arch + operator: In + values: ["arm64"] + containers: + - name: apply + image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 + command: ["/bin/bash", "-c"] # bash, not sh: the script enables pipefail, which plain /bin/sh (dash) rejects + args: + - | + set -euo pipefail + + vault_addr="${VAULT_ADDR:-http://vault.vault.svc.cluster.local:8200}" + vault_role="${VAULT_ROLE:-veles-secrets}" + jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" + login_payload="$(jq -nc --arg jwt "${jwt}" --arg role "${vault_role}" '{jwt:$jwt, role:$role}')" + vault_token="$(curl -sS --request POST --data "${login_payload}" \ + "${vault_addr}/v1/auth/kubernetes/login" | jq -r '.auth.client_token')" + if [ -z "${vault_token}" ] || [ "${vault_token}" = "null" ]; then + echo "vault login failed" >&2 + exit 1 + fi + + read_secret() { + path="$1" + out="$2" + curl -sS -o "${out}" -w "%{http_code}" \ + -H "X-Vault-Token: ${vault_token}" \ +
"${vault_addr}/v1/kv/data/atlas/${path}" || true + } + + write_secret() { + path="$1" + payload="$2" + out="$(mktemp)" + status="$(curl -sS -o "${out}" -w "%{http_code}" -X POST \ + -H "X-Vault-Token: ${vault_token}" \ + -H "Content-Type: application/json" \ + -d "${payload}" \ + "${vault_addr}/v1/kv/data/atlas/${path}")" + if [ "${status}" != "200" ] && [ "${status}" != "204" ]; then + echo "Vault write failed for ${path} (status ${status})" >&2 + cat "${out}" >&2 || true + exit 1 + fi + } + + rand_b64() { + bytes="$1" + openssl rand -base64 "${bytes}" | tr -d '\n' + } + + status="$(read_secret veles/veles-db /tmp/veles-db.json)" + if [ "${status}" = "200" ]; then + db_password="$(jq -r '.data.data.POSTGRES_PASSWORD // empty' /tmp/veles-db.json)" + elif [ "${status}" = "404" ]; then + db_password="" + else + echo "Vault read failed for veles-db (status ${status})" >&2 + cat /tmp/veles-db.json >&2 || true + exit 1 + fi + if [ -z "${db_password}" ]; then + db_password="$(rand_b64 36)" + fi + db_payload="$(jq -nc \ + --arg host "veles-postgres.veles.svc.cluster.local" \ + --arg port "5432" \ + --arg db "veles" \ + --arg user "veles" \ + --arg password "${db_password}" \ + '{data:{POSTGRES_HOST:$host,POSTGRES_PORT:$port,POSTGRES_DB:$db,POSTGRES_USER:$user,POSTGRES_PASSWORD:$password,DATABASE_URL:("postgresql://"+$user+":"+($password|@uri)+"@"+$host+":"+$port+"/"+$db+"?sslmode=disable")}}')" + write_secret veles/veles-db "${db_payload}" # DATABASE_URL carries the password percent-encoded: base64 output may contain / and +, which are not valid raw in URI userinfo + + status="$(read_secret veles/app-secrets /tmp/app-secrets.json)" + if [ "${status}" = "200" ]; then + session_secret="$(jq -r '.data.data.VELES_SESSION_SECRET // empty' /tmp/app-secrets.json)" + byok_key="$(jq -r '.data.data.VELES_BYOK_ENCRYPTION_KEY // empty' /tmp/app-secrets.json)" + elif [ "${status}" = "404" ]; then + session_secret="" + byok_key="" + else + echo "Vault read failed for app-secrets (status ${status})" >&2 + cat /tmp/app-secrets.json >&2 || true + exit 1 + fi + if [ -z "${session_secret}" ]; then +
session_secret="$(rand_b64 48)" + fi + if [ -z "${byok_key}" ]; then + byok_key="$(rand_b64 32)" + fi + app_payload="$(jq -nc \ + --arg session_secret "${session_secret}" \ + --arg byok_key "${byok_key}" \ + '{data:{VELES_SESSION_SECRET:$session_secret,VELES_BYOK_ENCRYPTION_KEY:$byok_key}}')" + write_secret veles/app-secrets "${app_payload}" + + postmark_status="$(read_secret shared/postmark-relay /tmp/postmark.json)" + if [ "${postmark_status}" = "200" ]; then + smtp_password="$(jq -r '.data.data.apikey // empty' /tmp/postmark.json)" + if [ -n "${smtp_password}" ]; then + smtp_payload="$(jq -nc \ + --arg host "mail.bstein.dev" \ + --arg port "587" \ + --arg user "${smtp_password}" \ + --arg password "${smtp_password}" \ + --arg from "no-reply-veles@bstein.dev" \ + --arg from_name "Veles" \ + '{data:{SMTP_HOST:$host,SMTP_PORT:$port,SMTP_USER:$user,SMTP_PASSWORD:$password,SMTP_FROM:$from,SMTP_FROM_NAME:$from_name,SMTP_STARTTLS:"true"}}')" + write_secret veles/smtp "${smtp_payload}" + fi + fi + + echo "Veles Vault paths ready: veles-db, app-secrets, smtp when Postmark relay exists" diff --git a/services/veles/postgres-service.yaml b/services/veles/postgres-service.yaml new file mode 100644 index 00000000..f805f0eb --- /dev/null +++ b/services/veles/postgres-service.yaml @@ -0,0 +1,17 @@ +# services/veles/postgres-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: veles-postgres + namespace: veles + labels: + app: veles-postgres +spec: + clusterIP: None + ports: + - name: postgres + port: 5432 + protocol: TCP + targetPort: 5432 + selector: + app: veles-postgres diff --git a/services/veles/postgres-statefulset.yaml b/services/veles/postgres-statefulset.yaml new file mode 100644 index 00000000..b13e05a4 --- /dev/null +++ b/services/veles/postgres-statefulset.yaml @@ -0,0 +1,92 @@ +# services/veles/postgres-statefulset.yaml +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: veles-postgres + namespace: veles + labels: + app: veles-postgres +spec: +
serviceName: veles-postgres + replicas: 0 + selector: + matchLabels: + app: veles-postgres + persistentVolumeClaimRetentionPolicy: + whenDeleted: Retain + whenScaled: Retain + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + app: veles-postgres + annotations: + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/agent-pre-populate-only: "true" + vault.hashicorp.com/role: "veles" + vault.hashicorp.com/agent-inject-secret-postgres-password: "kv/data/atlas/veles/veles-db" + vault.hashicorp.com/agent-inject-template-postgres-password: | + {{- with secret "kv/data/atlas/veles/veles-db" -}} + {{ .Data.data.POSTGRES_PASSWORD }} + {{- end -}} + spec: + serviceAccountName: veles-postgres + priorityClassName: veles-core + nodeSelector: + veles.bstein.dev/node-pool: oceanus + tolerations: + - key: veles.bstein.dev/simulation + operator: Equal + value: "true" + effect: NoSchedule + securityContext: + fsGroup: 999 + seccompProfile: + type: RuntimeDefault + containers: + - name: postgres + image: postgres:15 + ports: + - name: postgres + containerPort: 5432 + protocol: TCP + env: + - name: PGDATA + value: /var/lib/postgresql/data/pgdata + - name: POSTGRES_USER + value: veles + - name: POSTGRES_PASSWORD_FILE + value: /vault/secrets/postgres-password + - name: POSTGRES_DB + value: veles + resources: + requests: + cpu: "2" + memory: 8Gi + limits: + cpu: "4" + memory: 16Gi + securityContext: + # Start directly as uid/gid 999 (the postgres user in the Debian-based official image, matching fsGroup above): with every capability dropped, a root entrypoint cannot chown PGDATA or switch user via gosu. + runAsNonRoot: true + runAsUser: 999 + runAsGroup: 999 + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: postgres-data + mountPath: /var/lib/postgresql/data + volumeClaimTemplates: + - metadata: + name: postgres-data + labels: + app: veles-postgres + veles.bstein.dev/backup: longhorn + spec: + accessModes: ["ReadWriteOnce"] + storageClassName: veles-oceanus-db + resources: + requests: + storage: 100Gi diff --git a/services/veles/rbac.yaml b/services/veles/rbac.yaml new file mode 100644 index 00000000..e65d6db0 --- /dev/null +++ b/services/veles/rbac.yaml @@ -0,0 +1,36 @@
+# services/veles/rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: veles-backend-jobs + namespace: veles +rules: + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["create", "delete", "deletecollection", "get", "list", "patch", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["delete", "get", "list", "watch"] + - apiGroups: [""] + resources: ["pods/log"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["events"] + verbs: ["get", "list", "watch"] + - apiGroups: ["events.k8s.io"] + resources: ["events"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: veles-backend-jobs + namespace: veles +subjects: + - kind: ServiceAccount + name: veles-backend + namespace: veles +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: veles-backend-jobs diff --git a/services/veles/resourcequota.yaml b/services/veles/resourcequota.yaml new file mode 100644 index 00000000..6591a916 --- /dev/null +++ b/services/veles/resourcequota.yaml @@ -0,0 +1,54 @@ +# services/veles/resourcequota.yaml +apiVersion: v1 +kind: ResourceQuota +metadata: + name: veles-namespace-quota + namespace: veles +spec: + hard: + requests.cpu: "12" + requests.memory: 24Gi + limits.cpu: "40" + limits.memory: 96Gi + pods: "60" + count/jobs.batch: "100" + persistentvolumeclaims: "8" + requests.storage: 300Gi +--- +apiVersion: v1 +kind: ResourceQuota +metadata: + name: veles-core-quota + namespace: veles +spec: + hard: + requests.cpu: "4" + requests.memory: 12Gi + limits.cpu: "8" + limits.memory: 24Gi + pods: "12" + scopeSelector: + matchExpressions: + - scopeName: PriorityClass + operator: In + values: + - veles-core +--- +apiVersion: v1 +kind: ResourceQuota +metadata: + name: veles-sim-quota + namespace: veles +spec: + hard: + requests.cpu: "8" + requests.memory: 16Gi + limits.cpu: "32" + limits.memory: 72Gi + pods: "48" + scopeSelector: + matchExpressions: + - 
scopeName: PriorityClass + operator: In + values: + - veles-sim diff --git a/services/veles/secretproviderclass.yaml b/services/veles/secretproviderclass.yaml new file mode 100644 index 00000000..a3069963 --- /dev/null +++ b/services/veles/secretproviderclass.yaml @@ -0,0 +1,21 @@ +# services/veles/secretproviderclass.yaml +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: veles-vault + namespace: veles +spec: + provider: vault + parameters: + vaultAddress: "http://vault.vault.svc.cluster.local:8200" + roleName: "veles" + objects: | + - objectName: "harbor-pull__dockerconfigjson" + secretPath: "kv/data/atlas/shared/harbor-pull" + secretKey: "dockerconfigjson" + secretObjects: + - secretName: harbor-regcred + type: kubernetes.io/dockerconfigjson + data: + - objectName: harbor-pull__dockerconfigjson + key: .dockerconfigjson diff --git a/services/veles/serviceaccounts.yaml b/services/veles/serviceaccounts.yaml new file mode 100644 index 00000000..e0b79580 --- /dev/null +++ b/services/veles/serviceaccounts.yaml @@ -0,0 +1,45 @@ +# services/veles/serviceaccounts.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: veles-backend + namespace: veles +imagePullSecrets: + - name: harbor-regcred +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: veles-frontend + namespace: veles +imagePullSecrets: + - name: harbor-regcred +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: veles-postgres + namespace: veles +imagePullSecrets: + - name: harbor-regcred +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: veles-vault-sync + namespace: veles +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: veles-secrets-ensure + namespace: veles +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: veles-sim + namespace: veles +automountServiceAccountToken: false +imagePullSecrets: + - name: harbor-regcred diff --git a/services/veles/services.yaml b/services/veles/services.yaml new file mode 100644 
index 00000000..6cd670e4 --- /dev/null +++ b/services/veles/services.yaml @@ -0,0 +1,32 @@ +# services/veles/services.yaml +apiVersion: v1 +kind: Service +metadata: + name: veles-backend + namespace: veles + labels: + app: veles-backend +spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 8080 + selector: + app: veles-backend +--- +apiVersion: v1 +kind: Service +metadata: + name: veles-frontend + namespace: veles + labels: + app: veles-frontend +spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 8080 + selector: + app: veles-frontend diff --git a/services/veles/vault-sync-deployment.yaml b/services/veles/vault-sync-deployment.yaml new file mode 100644 index 00000000..dfba5310 --- /dev/null +++ b/services/veles/vault-sync-deployment.yaml @@ -0,0 +1,43 @@ +# services/veles/vault-sync-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: veles-vault-sync + namespace: veles + labels: + app: veles-vault-sync +spec: + replicas: 1 + selector: + matchLabels: + app: veles-vault-sync + template: + metadata: + labels: + app: veles-vault-sync + spec: + serviceAccountName: veles-vault-sync + containers: + - name: sync + image: alpine:3.20 + command: ["/bin/sh", "-c"] + args: + - "sleep infinity" + resources: + requests: + cpu: 10m + memory: 16Mi + limits: + cpu: 50m + memory: 64Mi + volumeMounts: + - name: vault-secrets + mountPath: /vault/secrets + readOnly: true + volumes: + - name: vault-secrets + csi: + driver: secrets-store.csi.k8s.io + readOnly: true + volumeAttributes: + secretProviderClass: veles-vault