veles: stage atlas infrastructure

This commit is contained in:
jenkins 2026-06-09 00:46:46 -03:00
parent e1d091eb14
commit 654900b8a2
41 changed files with 1562 additions and 14 deletions

View File

@ -28,6 +28,7 @@ resources:
- ai-llm/kustomization.yaml
- openclaw/kustomization.yaml
- game-stream/kustomization.yaml
- veles/kustomization.yaml
- typhon/kustomization.yaml
- nextcloud/kustomization.yaml
- nextcloud-mail-sync/kustomization.yaml

View File

@ -0,0 +1,29 @@
# clusters/atlas/flux-system/applications/veles/image-automation.yaml
# Staged for the first Veles image rollout. Add this file to the parent
# applications kustomization after the namespace exists and the Harbor repos
# have initial tags.
#
# Runs on a one-minute interval, applies the Setters update strategy to files
# under services/veles, and commits as flux-bot directly to main.
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
  namespace: veles
  name: veles
spec:
  interval: 1m0s
  # The Git source is referenced across namespaces (it lives in flux-system).
  sourceRef:
    kind: GitRepository
    namespace: flux-system
    name: flux-system
  update:
    strategy: Setters
    path: services/veles
  git:
    # Checkout and push use the same branch, so updates land on main itself.
    checkout:
      ref:
        branch: main
    push:
      branch: main
    commit:
      messageTemplate: 'chore(veles): automated image update'
      author:
        name: flux-bot
        email: ops@bstein.dev

View File

@ -0,0 +1,28 @@
# clusters/atlas/flux-system/applications/veles/kustomization.yaml
# Flux Kustomization that applies ./services/veles from the flux-system
# GitRepository into the veles namespace.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  namespace: flux-system
  name: veles
  annotations:
    kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
  sourceRef:
    kind: GitRepository
    namespace: flux-system
    name: flux-system
  path: ./services/veles
  targetNamespace: veles
  interval: 10m
  timeout: 20m
  prune: true
  # Health of the applied workloads is not awaited during reconciliation.
  wait: false
  # These Kustomizations must be ready before this one is applied.
  dependsOn:
    - name: cert-manager
    - name: core
    - name: keycloak
    - name: longhorn
    - name: traefik
    - name: vault
    - name: vault-csi
    - name: vault-injector

View File

@ -55,6 +55,20 @@ spec:
k label node titan-22 atlas.bstein.dev/general-compute=last-resort --overwrite=true || true
fi
if k get node titan-23 >/dev/null 2>&1; then
k label node titan-23 \
veles.bstein.dev/simulation=true \
veles.bstein.dev/node-pool=oceanus \
node-role.kubernetes.io/veles-sim=true \
longhorn-host=true \
hardware=oceanus \
--overwrite=true || true
k label node titan-23 node-role.kubernetes.io/worker- || true
k taint node titan-23 veles.bstein.dev/simulation=true:NoSchedule --overwrite=true || true
else
echo "skipping missing node titan-23"
fi
for node in titan-13 titan-15 titan-17 titan-19; do
if k get node "${node}" >/dev/null 2>&1; then
k label node "${node}" atlas.bstein.dev/spillover=true longhorn-host=true --overwrite=true || true

View File

@ -81,7 +81,13 @@ spec:
tag: v2.16.0
defaultSettings:
systemManagedPodsImagePullPolicy: Always
taintToleration: veles.bstein.dev/simulation=true:NoSchedule
longhornManager:
tolerations:
- key: veles.bstein.dev/simulation
operator: Equal
value: "true"
effect: NoSchedule
nodeSelector:
longhorn-host: "true"
longhornDriver:

View File

@ -7,6 +7,7 @@ resources:
- secretproviderclass.yaml
- vault-sync-deployment.yaml
- helmrelease.yaml
- veles-recurring-jobs.yaml
- longhorn-settings-ensure-job.yaml
- longhorn-disk-tags-ensure-job.yaml

View File

@ -2,7 +2,7 @@
apiVersion: batch/v1
kind: Job
metadata:
name: longhorn-disk-tags-ensure-1
name: longhorn-disk-tags-ensure-3
namespace: longhorn-system
spec:
backoffLimit: 0

View File

@ -17,10 +17,28 @@ import urllib.request
LONGHORN_NS = "longhorn-system"
LONGHORN_API = "/apis/longhorn.io/v1beta2/namespaces/{namespace}/nodes"
DESIRED_TAGS = {
"/mnt/astreae": "astreae",
"/mnt/asteria": "asteria",
DESIRED_DISK_TAGS = {
"/mnt/astreae": ["astreae"],
"/mnt/asteria": ["asteria"],
"/mnt/veles": ["veles-oceanus", "veles-db", "veles-artifacts"],
"/mnt/veles-db": ["veles-oceanus", "veles-db"],
"/mnt/veles-artifacts": ["veles-oceanus", "veles-artifacts"],
}
DESIRED_NODE_TAGS = {
"titan-23": ["veles-oceanus"],
}
DESIRED_NODE_DISKS = {
"titan-23": {
"veles-oceanus": {
"path": "/mnt/veles",
"allowScheduling": True,
"evictionRequested": False,
"storageReserved": 0,
"tags": ["veles-oceanus", "veles-db", "veles-artifacts"],
}
}
}
DISABLE_DEFAULT_DISK_NODES = {"titan-23"}
def api_base() -> str:
@ -63,8 +81,30 @@ def list_nodes() -> list[dict]:
return data.get("items", [])
def patch_disk_tags(node_name: str, disk_name: str, desired_tag: str) -> None:
body = {"spec": {"disks": {disk_name: {"tags": [desired_tag]}}}}
def merged_tags(current_tags: list[str], desired_tags: list[str]) -> list[str]:
    """Return the sorted, de-duplicated union of the current and desired tags."""
    union = set(current_tags)
    union.update(desired_tags)
    return sorted(union)
def patch_node_tags(node_name: str, desired_tags: list[str]) -> None:
    """Send a PATCH that sets ``spec.tags`` on the named Longhorn node."""
    node_path = f"{LONGHORN_API.format(namespace=LONGHORN_NS)}/{node_name}"
    request_json("PATCH", node_path, body={"spec": {"tags": desired_tags}})
def patch_disk_tags(node_name: str, disk_name: str, desired_tags: list[str]) -> None:
    """Send a PATCH that sets the tag list of one disk on the named Longhorn node."""
    node_path = f"{LONGHORN_API.format(namespace=LONGHORN_NS)}/{node_name}"
    disk_patch = {disk_name: {"tags": desired_tags}}
    request_json("PATCH", node_path, body={"spec": {"disks": disk_patch}})
def patch_disks(node_name: str, disks: dict) -> None:
body = {"spec": {"disks": disks}}
request_json(
"PATCH",
f"{LONGHORN_API.format(namespace=LONGHORN_NS)}/{node_name}",
@ -78,18 +118,52 @@ def main() -> int:
for node in list_nodes():
name = node.get("metadata", {}).get("name", "")
desired_node_tags = DESIRED_NODE_TAGS.get(name)
if desired_node_tags:
current_node_tags = node.get("spec", {}).get("tags") or []
next_node_tags = merged_tags(current_node_tags, desired_node_tags)
if current_node_tags != next_node_tags:
print(f"patching {name} node tags={current_node_tags!r} -> {next_node_tags!r}")
patch_node_tags(name, next_node_tags)
changed += 1
else:
skipped += 1
spec_disks = node.get("spec", {}).get("disks", {}) or {}
desired_disks = DESIRED_NODE_DISKS.get(name, {})
missing_disks = {
disk_name: disk_spec
for disk_name, disk_spec in desired_disks.items()
if disk_name not in spec_disks
}
if missing_disks:
print(f"adding {name} disks={sorted(missing_disks)}")
patch_disks(name, missing_disks)
changed += len(missing_disks)
spec_disks = {**spec_disks, **missing_disks}
if name in DISABLE_DEFAULT_DISK_NODES:
disable_patch = {}
for disk_name, disk in spec_disks.items():
disk_path = (disk.get("path") or "").rstrip("/")
if disk_path == "/var/lib/longhorn" and disk.get("allowScheduling", True):
disable_patch[disk_name] = {"allowScheduling": False}
if disable_patch:
print(f"disabling default Longhorn scheduling on {name} disks={sorted(disable_patch)}")
patch_disks(name, disable_patch)
changed += len(disable_patch)
for disk_name, disk in spec_disks.items():
disk_path = disk.get("path")
desired_tag = DESIRED_TAGS.get(disk_path)
if not desired_tag:
desired_disk_tags = DESIRED_DISK_TAGS.get(disk_path)
if not desired_disk_tags:
continue
current_tags = disk.get("tags") or []
if current_tags == [desired_tag]:
if current_tags == desired_disk_tags:
skipped += 1
continue
print(f"patching {name}:{disk_name} path={disk_path} tags={current_tags!r} -> {[desired_tag]!r}")
patch_disk_tags(name, disk_name, desired_tag)
print(f"patching {name}:{disk_name} path={disk_path} tags={current_tags!r} -> {desired_disk_tags!r}")
patch_disk_tags(name, disk_name, desired_disk_tags)
changed += 1
print(f"done: changed={changed} skipped={skipped}")

View File

@ -0,0 +1,28 @@
# infrastructure/longhorn/core/veles-recurring-jobs.yaml
# Longhorn recurring jobs for the veles / veles-postgres groups:
# one backup job and one snapshot job.

# Backup at 05:30 (cron "30 5 * * *"), keeping 7.
apiVersion: longhorn.io/v1beta2
kind: RecurringJob
metadata:
  namespace: longhorn-system
  name: veles-postgres-backup
spec:
  task: backup
  cron: '30 5 * * *'
  retain: 7
  concurrency: 1
  groups:
    - veles
    - veles-postgres
---
# Snapshot every 30 minutes, keeping 8.
apiVersion: longhorn.io/v1beta2
kind: RecurringJob
metadata:
  namespace: longhorn-system
  name: veles-postgres-snapshot
spec:
  task: snapshot
  cron: '*/30 * * * *'
  retain: 8
  concurrency: 1
  groups:
    - veles
    - veles-postgres

View File

@ -3,3 +3,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- scavenger.yaml
- veles.yaml

View File

@ -0,0 +1,17 @@
# infrastructure/modules/base/priorityclass/veles.yaml
# Two priority tiers for Veles: core (500) and simulation (50).
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: veles-core
description: 'For Veles core database, API, and controller workloads'
globalDefault: false
value: 500
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: veles-sim
description: 'For Veles simulation jobs; lower than core and non-preempting'
globalDefault: false
value: 50
# Simulation pods never evict other pods to get scheduled.
preemptionPolicy: Never

View File

@ -5,3 +5,6 @@ resources:
- asteria.yaml
- asteria-encrypted.yaml
- astreae.yaml
- veles-oceanus-db.yaml
- veles-oceanus-artifacts.yaml
- veles-oceanus-policy.yaml

View File

@ -0,0 +1,20 @@
# infrastructure/modules/base/storageclass/veles-oceanus-artifacts.yaml
# Single-replica Longhorn class restricted to nodes tagged veles-oceanus and
# disks tagged both veles-oceanus and veles-artifacts.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: veles-oceanus-artifacts
  annotations:
    veles.bstein.dev/allowed-namespace: veles
provisioner: driver.longhorn.io
volumeBindingMode: WaitForFirstConsumer
allowVolumeExpansion: true
reclaimPolicy: Retain
parameters:
  # Placement selectors (Longhorn node and disk tags).
  nodeSelector: 'veles-oceanus'
  diskSelector: 'veles-oceanus,veles-artifacts'
  # Replica layout.
  numberOfReplicas: '1'
  dataLocality: strict-local
  replicaAutoBalance: disabled
  staleReplicaTimeout: '30'
  # Volume contents.
  fsType: ext4
  fromBackup: ''

View File

@ -0,0 +1,21 @@
# infrastructure/modules/base/storageclass/veles-oceanus-db.yaml
# Single-replica Longhorn class restricted to nodes tagged veles-oceanus and
# disks tagged both veles-oceanus and veles-db; volumes are bound to the
# veles-postgres-backup and veles-postgres-snapshot recurring jobs by name.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: veles-oceanus-db
  annotations:
    veles.bstein.dev/allowed-namespace: veles
provisioner: driver.longhorn.io
volumeBindingMode: WaitForFirstConsumer
allowVolumeExpansion: true
reclaimPolicy: Retain
parameters:
  # Placement selectors (Longhorn node and disk tags).
  nodeSelector: 'veles-oceanus'
  diskSelector: 'veles-oceanus,veles-db'
  # Replica layout.
  numberOfReplicas: '1'
  dataLocality: strict-local
  replicaAutoBalance: disabled
  staleReplicaTimeout: '30'
  # Volume contents.
  fsType: ext4
  fromBackup: ''
  # Both entries reference individual jobs (isGroup is false), not job groups.
  recurringJobSelector: '[{"name":"veles-postgres-backup","isGroup":false},{"name":"veles-postgres-snapshot","isGroup":false}]'

View File

@ -0,0 +1,25 @@
# infrastructure/modules/base/storageclass/veles-oceanus-policy.yaml
# Rejects PersistentVolumeClaims that name a Veles Oceanus storage class from
# any namespace other than veles.
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicy
metadata:
  name: veles-oceanus-storage-namespace
spec:
  failurePolicy: Fail
  matchConstraints:
    resourceRules:
      - apiGroups:
          - ''
        apiVersions:
          - v1
        resources:
          - persistentvolumeclaims
        operations:
          - CREATE
          - UPDATE
  validations:
    # Passes when no class is set, when the class is not one of the two
    # reserved classes, or when the claim is in namespace veles.
    - message: 'Veles Oceanus storage classes are reserved for namespace veles'
      expression: >-
        !has(object.spec.storageClassName)
        || !(object.spec.storageClassName in ['veles-oceanus-db', 'veles-oceanus-artifacts'])
        || object.metadata.namespace == 'veles'
---
# Binds the policy with the Deny action.
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicyBinding
metadata:
  name: veles-oceanus-storage-namespace
spec:
  policyName: veles-oceanus-storage-namespace
  validationActions:
    - Deny

View File

@ -429,6 +429,24 @@ data:
}
}
}
// Staged Job DSL definition for the Veles image pipeline; it is created disabled.
pipelineJob('veles') {
disabled(true)
description('Staged Veles alpha image pipeline. Backend/frontend should build linux/amd64 and linux/arm64; sim-worker may begin amd64-only if Forge dependencies require it.')
definition {
// The pipeline script is loaded from SCM rather than defined inline.
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/veles.git')
// Jenkins credentials id used to reach the remote.
credentials('gitea-pat')
}
branches('*/main')
}
}
// Path of the pipeline script inside the checked-out repository.
scriptPath('Jenkinsfile')
}
}
}
multibranchPipelineJob('titan-iac-quality-gate') {
branchSources {
branchSource {

View File

@ -27,6 +27,7 @@ resources:
- oneoffs/soteria-oidc-secret-ensure-job.yaml
- oneoffs/quality-oidc-secret-ensure-job.yaml
- oneoffs/agent-oidc-secret-ensure-job.yaml
- oneoffs/veles-realm-ensure-job.yaml
- oneoffs/metis-ssh-keys-secret-ensure-job.yaml
- oneoffs/metis-node-passwords-secret-ensure-job.yaml
- oneoffs/harbor-oidc-secret-ensure-job.yaml

View File

@ -0,0 +1,332 @@
# services/keycloak/oneoffs/veles-realm-ensure-job.yaml
# One-off job for sso/veles-realm-ensure-1.
# Purpose: create the Veles realm, groups, OIDC client, SMTP settings, and Vault client secret.
# Keep suspended until Veles Vault paths/policies have reconciled, then unsuspend once.
apiVersion: batch/v1
kind: Job
metadata:
name: veles-realm-ensure-1
namespace: sso
spec:
suspend: true
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:
metadata:
annotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/agent-pre-populate-only: "true"
vault.hashicorp.com/role: "sso-secrets"
vault.hashicorp.com/agent-inject-secret-keycloak-admin-env.sh: "kv/data/atlas/shared/keycloak-admin"
vault.hashicorp.com/agent-inject-template-keycloak-admin-env.sh: |
{{ with secret "kv/data/atlas/shared/keycloak-admin" }}
export KEYCLOAK_ADMIN="{{ .Data.data.username }}"
export KEYCLOAK_ADMIN_USER="{{ .Data.data.username }}"
export KEYCLOAK_ADMIN_PASSWORD="{{ .Data.data.password }}"
{{ end }}
{{ with secret "kv/data/atlas/shared/postmark-relay" }}
export KEYCLOAK_SMTP_USER="{{ index .Data.data "apikey" }}"
export KEYCLOAK_SMTP_PASSWORD="{{ index .Data.data "apikey" }}"
{{ end }}
spec:
serviceAccountName: mas-secrets-ensure
restartPolicy: Never
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/worker
operator: Exists
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
containers:
- name: configure
image: python:3.11-alpine
env:
- name: KEYCLOAK_SERVER
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_REALM
value: veles
- name: KEYCLOAK_CLIENT_ID
value: veles-web
- name: KEYCLOAK_PUBLIC_ISSUER
value: https://sso.bstein.dev/realms/veles
- name: VELES_BASE_URL
value: https://veles.bstein.dev
- name: KEYCLOAK_SMTP_HOST
value: mail.bstein.dev
- name: KEYCLOAK_SMTP_PORT
value: "587"
- name: KEYCLOAK_SMTP_FROM
value: no-reply-veles@bstein.dev
- name: KEYCLOAK_SMTP_FROM_NAME
value: Veles
command: ["/bin/sh", "-c"]
args:
- |
set -eu
. /vault/secrets/keycloak-admin-env.sh
python - <<'PY'
import json
import os
import time
import urllib.error
import urllib.parse
import urllib.request
base_url = os.environ["KEYCLOAK_SERVER"].rstrip("/")
realm = os.environ["KEYCLOAK_REALM"]
client_id = os.environ["KEYCLOAK_CLIENT_ID"]
issuer = os.environ["KEYCLOAK_PUBLIC_ISSUER"]
veles_base_url = os.environ["VELES_BASE_URL"].rstrip("/")
admin_user = os.environ["KEYCLOAK_ADMIN_USER"]
admin_password = os.environ["KEYCLOAK_ADMIN_PASSWORD"]
def request(method, url, token=None, payload=None, headers=None, timeout=30):
    """Perform one HTTP call and return ``(status, parsed_body)``.

    ``token`` is sent as a Bearer Authorization header. ``payload`` is
    JSON-encoded and sent with a JSON Content-Type. An empty response body
    yields ``None``. HTTP error responses are returned, not raised: their body
    is parsed as JSON when possible, otherwise wrapped as ``{"raw": text}``.
    """
    outgoing = headers.copy() if headers else {}
    if token:
        outgoing["Authorization"] = f"Bearer {token}"
    encoded = None
    if payload is not None:
        encoded = json.dumps(payload).encode()
        outgoing["Content-Type"] = "application/json"
    call = urllib.request.Request(url, data=encoded, headers=outgoing, method=method)
    try:
        with urllib.request.urlopen(call, timeout=timeout) as resp:
            content = resp.read()
            return resp.status, (json.loads(content.decode()) if content else None)
    except urllib.error.HTTPError as failure:
        leftover = failure.read()
        if leftover:
            try:
                parsed = json.loads(leftover.decode())
            except Exception:
                # Non-JSON error bodies are kept as text for the caller's message.
                parsed = {"raw": leftover.decode(errors="replace")}
            return failure.code, parsed
        return failure.code, None
# Obtain an admin access token from the master realm using the admin-cli
# password grant. Up to ten attempts are made, sleeping 2s, 4s, ... between
# failures; the tenth failure aborts the job.
token_body = None
form = urllib.parse.urlencode(
    {
        "grant_type": "password",
        "client_id": "admin-cli",
        "username": admin_user,
        "password": admin_password,
    }
).encode()
for attempt in range(1, 11):
    req = urllib.request.Request(
        f"{base_url}/realms/master/protocol/openid-connect/token",
        data=form,
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            token_body = json.loads(resp.read().decode())
        break
    # URLError is a subclass of OSError, so every failure that was retried
    # before is still retried. Catching OSError additionally covers socket
    # timeouts and connection resets raised while waiting for or reading the
    # response, which urllib does not wrap in URLError and which would
    # otherwise abort the job on the first attempt.
    except OSError as exc:
        if attempt == 10:
            raise SystemExit(f"Keycloak token request failed after retries: {exc}")
        time.sleep(attempt * 2)
token = token_body["access_token"]
smtp = {
"host": os.environ["KEYCLOAK_SMTP_HOST"],
"port": os.environ["KEYCLOAK_SMTP_PORT"],
"from": os.environ["KEYCLOAK_SMTP_FROM"],
"fromDisplayName": os.environ["KEYCLOAK_SMTP_FROM_NAME"],
"replyTo": os.environ["KEYCLOAK_SMTP_FROM"],
"replyToDisplayName": os.environ["KEYCLOAK_SMTP_FROM_NAME"],
"user": os.environ["KEYCLOAK_SMTP_USER"],
"password": os.environ["KEYCLOAK_SMTP_PASSWORD"],
"auth": "true",
"starttls": "true",
"ssl": "false",
}
status, realm_rep = request("GET", f"{base_url}/admin/realms/{realm}", token)
if status == 404:
create_payload = {
"realm": realm,
"enabled": True,
"registrationAllowed": False,
"resetPasswordAllowed": True,
"verifyEmail": True,
"loginWithEmailAllowed": True,
"duplicateEmailsAllowed": False,
"smtpServer": smtp,
}
status, body = request("POST", f"{base_url}/admin/realms", token, create_payload)
if status not in (201, 204, 409):
raise SystemExit(f"Realm create failed: status={status} body={body}")
status, realm_rep = request("GET", f"{base_url}/admin/realms/{realm}", token)
if status != 200 or not isinstance(realm_rep, dict):
raise SystemExit(f"Realm fetch failed: status={status}")
realm_rep.update(
{
"enabled": True,
"registrationAllowed": False,
"resetPasswordAllowed": True,
"verifyEmail": True,
"loginWithEmailAllowed": True,
"duplicateEmailsAllowed": False,
"smtpServer": smtp,
}
)
status, body = request("PUT", f"{base_url}/admin/realms/{realm}", token, realm_rep)
if status not in (200, 204):
raise SystemExit(f"Realm update failed: status={status} body={body}")
def ensure_group(name):
    """Create realm group ``name`` unless the search returns an exact-name match.

    Exits the script when the search does not return 200, or when the create
    call returns anything other than 201, 204 or 409.
    """
    status, found = request(
        "GET",
        f"{base_url}/admin/realms/{realm}/groups?search={urllib.parse.quote(name)}",
        token,
    )
    if status != 200:
        raise SystemExit(f"Group search failed for {name}: status={status}")
    # The search may return other groups too; only an exact name counts.
    for candidate in found or []:
        if candidate.get("name") == name:
            return
    status, body = request("POST", f"{base_url}/admin/realms/{realm}/groups", token, {"name": name})
    if status in (201, 204, 409):
        return
    raise SystemExit(f"Group create failed for {name}: status={status} body={body}")
ensure_group("alpha")
ensure_group("admin")
status, clients = request(
"GET",
f"{base_url}/admin/realms/{realm}/clients?clientId={urllib.parse.quote(client_id)}",
token,
)
if status != 200:
raise SystemExit(f"Client lookup failed: status={status}")
client_uuid = clients[0]["id"] if clients else None
client_payload = {
"clientId": client_id,
"enabled": True,
"protocol": "openid-connect",
"publicClient": False,
"standardFlowEnabled": True,
"implicitFlowEnabled": False,
"directAccessGrantsEnabled": False,
"serviceAccountsEnabled": False,
"redirectUris": [f"{veles_base_url}/*"],
"webOrigins": [veles_base_url],
"rootUrl": veles_base_url,
"baseUrl": "/",
"attributes": {
"pkce.code.challenge.method": "S256",
"post.logout.redirect.uris": f"{veles_base_url}/*",
},
}
if not client_uuid:
status, body = request("POST", f"{base_url}/admin/realms/{realm}/clients", token, client_payload)
if status not in (201, 204, 409):
raise SystemExit(f"Client create failed: status={status} body={body}")
status, clients = request(
"GET",
f"{base_url}/admin/realms/{realm}/clients?clientId={urllib.parse.quote(client_id)}",
token,
)
client_uuid = clients[0]["id"] if clients else None
if not client_uuid:
raise SystemExit("Client veles-web not found after create")
status, body = request(
"PUT",
f"{base_url}/admin/realms/{realm}/clients/{client_uuid}",
token,
client_payload,
)
if status not in (200, 204):
raise SystemExit(f"Client update failed: status={status} body={body}")
mapper_payload = {
"name": "groups",
"protocol": "openid-connect",
"protocolMapper": "oidc-group-membership-mapper",
"consentRequired": False,
"config": {
"full.path": "false",
"id.token.claim": "true",
"access.token.claim": "true",
"userinfo.token.claim": "true",
"claim.name": "groups",
},
}
status, mappers = request(
"GET",
f"{base_url}/admin/realms/{realm}/clients/{client_uuid}/protocol-mappers/models",
token,
)
if status != 200:
raise SystemExit(f"Mapper lookup failed: status={status}")
mapper_id = next((mapper.get("id") for mapper in mappers or [] if mapper.get("name") == "groups"), None)
if mapper_id:
status, body = request(
"PUT",
f"{base_url}/admin/realms/{realm}/clients/{client_uuid}/protocol-mappers/models/{mapper_id}",
token,
mapper_payload,
)
else:
status, body = request(
"POST",
f"{base_url}/admin/realms/{realm}/clients/{client_uuid}/protocol-mappers/models",
token,
mapper_payload,
)
if status not in (200, 201, 204):
raise SystemExit(f"Mapper ensure failed: status={status} body={body}")
status, secret = request(
"GET",
f"{base_url}/admin/realms/{realm}/clients/{client_uuid}/client-secret",
token,
)
client_secret = (secret or {}).get("value")
if status != 200 or not client_secret:
raise SystemExit(f"Client secret fetch failed: status={status}")
vault_addr = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200")
jwt = open("/var/run/secrets/kubernetes.io/serviceaccount/token", encoding="utf-8").read().strip()
login_payload = json.dumps({"jwt": jwt, "role": os.environ.get("VAULT_ROLE", "sso-secrets")}).encode()
req = urllib.request.Request(
f"{vault_addr}/v1/auth/kubernetes/login",
data=login_payload,
headers={"Content-Type": "application/json"},
method="POST",
)
with urllib.request.urlopen(req, timeout=20) as resp:
vault_token = json.loads(resp.read().decode())["auth"]["client_token"]
payload = {
"data": {
"client_id": client_id,
"client_secret": client_secret,
"issuer": issuer,
"realm": realm,
"required_groups": "alpha,admin",
}
}
req = urllib.request.Request(
f"{vault_addr}/v1/kv/data/atlas/veles/veles-oidc",
data=json.dumps(payload).encode(),
headers={"X-Vault-Token": vault_token, "Content-Type": "application/json"},
method="POST",
)
with urllib.request.urlopen(req, timeout=20) as resp:
if resp.status not in (200, 204):
raise SystemExit(f"Vault write returned {resp.status}")
print("Veles Keycloak realm/client ready")
PY

View File

@ -9,7 +9,7 @@ data:
METIS_INVENTORY_PATH: /app/inventory.titan-rpi4.yaml
METIS_DATA_DIR: /var/lib/metis
METIS_DEFAULT_FLASH_HOST: titan-20
METIS_FLASH_HOSTS: titan-20,titan-21,titan-22,titan-24,titan-19,titan-17,titan-15,titan-14,titan-12,titan-11,titan-10,titan-09,titan-08,titan-07,titan-06,titan-05,titan-04,titan-0c,titan-0b,titan-0a
METIS_FLASH_HOSTS: titan-20,titan-21,titan-22,titan-23,titan-24,titan-19,titan-17,titan-15,titan-14,titan-12,titan-11,titan-10,titan-09,titan-08,titan-07,titan-06,titan-05,titan-04,titan-0c,titan-0b,titan-0a
METIS_LOCAL_HOST: titan-20
METIS_ALLOWED_GROUPS: admin,maintenance
METIS_MAX_DEVICE_BYTES: "1000000000000"

View File

@ -38,6 +38,12 @@ spec:
operator: NotIn
values:
- "true"
- key: veles.bstein.dev/node-pool
operator: NotIn
values:
- oceanus
- key: node-role.kubernetes.io/accelerator
operator: Exists
tolerations:
- operator: Exists
containers:

View File

@ -50,6 +50,15 @@ spec:
upgrade:
disableWait: true
values:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: veles.bstein.dev/node-pool
operator: NotIn
values:
- oceanus
rbac:
pspEnabled: false
service:

View File

@ -240,6 +240,11 @@ write_policy_and_role "game-stream" "game-stream" "game-stream-vault" \
"game-stream/*" ""
write_policy_and_role "openclaw" "openclaw" "agent-vault" \
"openclaw/*" ""
write_policy_and_role "veles" "veles" "veles-backend,veles-postgres,veles-vault-sync" \
"veles/* shared/harbor-pull shared/postmark-relay" ""
write_policy_and_role "veles-secrets" "veles" "veles-secrets-ensure" \
"shared/postmark-relay" \
"veles/*"
write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync,metis" \
"maintenance/ariadne-db maintenance/metis-oidc maintenance/soteria-oidc maintenance/metis-ssh-keys maintenance/metis-runtime portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull shared/soteria-restic harbor/harbor-core" "" \
'
@ -266,8 +271,8 @@ write_policy_and_role "vault" "vault" "vault" \
"vault/*" ""
write_policy_and_role "sso-secrets" "sso" "mas-secrets-ensure" \
"shared/keycloak-admin maintenance/metis-ssh-keys" \
"harbor/harbor-oidc vault/vault-oidc-config comms/synapse-oidc logging/oauth2-proxy-logs-oidc finance/actual-oidc maintenance/metis-oidc maintenance/soteria-oidc maintenance/metis-ssh-keys openclaw/agent-oidc" \
"shared/keycloak-admin shared/postmark-relay maintenance/metis-ssh-keys" \
"harbor/harbor-oidc vault/vault-oidc-config comms/synapse-oidc logging/oauth2-proxy-logs-oidc finance/actual-oidc maintenance/metis-oidc maintenance/soteria-oidc maintenance/metis-ssh-keys openclaw/agent-oidc veles/veles-oidc" \
'
path "kv/data/atlas/nodes/*" {
capabilities = ["create", "update", "read"]

64
services/veles/NOTES.md Normal file
View File

@ -0,0 +1,64 @@
# Veles Infrastructure Contract
This stack is staged for Flux and intentionally starts the app deployments at `replicas: 0` until images and the app-side runtime contract are ready.
## Cluster Contract
- Namespace: `veles`; no alternate alpha namespace is used.
- Hostname: `https://veles.bstein.dev`
- Backend service: `veles-backend.veles.svc.cluster.local:80`
- Frontend service: `veles-frontend.veles.svc.cluster.local:80`
- Postgres service: `veles-postgres.veles.svc.cluster.local:5432`
- Artifact PVC: `veles-artifacts`, mounted at `/data/veles-artifacts`
- Storage classes: `veles-oceanus-db`, `veles-oceanus-artifacts`
- Images:
- `registry.bstein.dev/veles/veles-backend`
- `registry.bstein.dev/veles/veles-frontend`
- `registry.bstein.dev/veles/veles-sim-worker`
## Runtime Env
Veles should consume:
- `VELES_PUBLIC_BASE_URL=https://veles.bstein.dev`
- `VELES_OIDC_ISSUER=https://sso.bstein.dev/realms/veles`
- `VELES_OIDC_CLIENT_ID=veles-web`
- `VELES_OIDC_REQUIRED_GROUPS=alpha,admin`
- `DATABASE_URL` from `kv/data/atlas/veles/veles-db`
- `VELES_SESSION_SECRET` from `kv/data/atlas/veles/app-secrets`
- `VELES_BYOK_ENCRYPTION_KEY` from `kv/data/atlas/veles/app-secrets`
User OpenAI API keys must stay in the Veles database encrypted with `VELES_BYOK_ENCRYPTION_KEY`; do not store per-user BYOK secrets in Vault.
## Simulation Jobs
The backend service account can create, watch, and delete Jobs only inside the `veles` namespace. Simulation pods should use service account `veles-sim`, set `automountServiceAccountToken: false`, and use:
```yaml
priorityClassName: veles-sim
nodeSelector:
veles.bstein.dev/simulation: "true"
tolerations:
- key: veles.bstein.dev/simulation
operator: Equal
value: "true"
effect: NoSchedule
```
## Staged Operator Steps
1. Join `titan-23`/Oceanus to Atlas as a worker.
2. Use Metis with `titan-23` in `METIS_FLASH_HOSTS`; the existing node secret placeholder uses `192.168.22.23`.
3. Confirm the node normalizer applies the Veles labels and taint.
4. Add Oceanus Longhorn disks at paths tagged by the Longhorn tag ensure job.
5. Let Vault policy reconciliation run, then unsuspend `veles-secrets-ensure-1`.
6. Unsuspend `veles-realm-ensure-1` in `services/keycloak` to create the realm/client secret.
7. Create the Harbor `veles` project or robot access before image automation is enabled in production.
8. Scale `veles-postgres`, then backend/frontend once app images exist.
## Assumptions
- `veles-oceanus-artifacts` is RWO for alpha; simulation workers should either run on Oceanus with the backend or stream logs to the backend, which owns writes.
- Postgres uses Longhorn backup recurring jobs off Oceanus. This is not a substitute for a tested restore drill.
- The Jenkins job skeleton points at the Veles repo but stays disabled until that repo provides a Jenkinsfile.

View File

@ -0,0 +1,16 @@
# services/veles/artifacts-pvc.yaml
# 200Gi ReadWriteOnce claim on the veles-oceanus-artifacts storage class.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: veles
  name: veles-artifacts
  labels:
    app.kubernetes.io/component: artifacts
    app.kubernetes.io/name: veles
spec:
  storageClassName: veles-oceanus-artifacts
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 200Gi

View File

@ -0,0 +1,89 @@
# services/veles/backend-deployment.yaml
# Veles backend Deployment, staged at zero replicas. Pods are pinned to the
# oceanus node pool, tolerate the simulation taint, and mount the
# veles-artifacts claim.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: veles
  name: veles-backend
  labels:
    app: veles-backend
spec:
  replicas: 0
  revisionHistoryLimit: 2
  selector:
    matchLabels:
      app: veles-backend
  template:
    metadata:
      labels:
        app: veles-backend
      annotations:
        # The Vault agent renders /vault/secrets/veles-env.sh before the app
        # starts (pre-populate only, using Vault role "veles").
        vault.hashicorp.com/agent-inject: 'true'
        vault.hashicorp.com/agent-pre-populate-only: 'true'
        vault.hashicorp.com/role: 'veles'
        vault.hashicorp.com/agent-inject-secret-veles-env.sh: 'kv/data/atlas/veles/veles-db'
        vault.hashicorp.com/agent-inject-template-veles-env.sh: |
          {{- with secret "kv/data/atlas/veles/veles-db" }}
          export DATABASE_URL="{{ .Data.data.DATABASE_URL }}"
          export VELES_DATABASE_USER="{{ .Data.data.POSTGRES_USER }}"
          export VELES_DATABASE_PASSWORD="{{ .Data.data.POSTGRES_PASSWORD }}"
          {{- end }}
          {{- with secret "kv/data/atlas/veles/veles-oidc" }}
          export VELES_OIDC_CLIENT_SECRET="{{ .Data.data.client_secret }}"
          {{- end }}
          {{- with secret "kv/data/atlas/veles/app-secrets" }}
          export VELES_SESSION_SECRET="{{ .Data.data.VELES_SESSION_SECRET }}"
          export VELES_BYOK_ENCRYPTION_KEY="{{ .Data.data.VELES_BYOK_ENCRYPTION_KEY }}"
          {{- end }}
    spec:
      serviceAccountName: veles-backend
      priorityClassName: veles-core
      # Scheduling: oceanus pool only, tolerating its NoSchedule taint.
      nodeSelector:
        veles.bstein.dev/node-pool: oceanus
      tolerations:
        - key: veles.bstein.dev/simulation
          operator: Equal
          value: 'true'
          effect: NoSchedule
      securityContext:
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      volumes:
        - name: artifacts
          persistentVolumeClaim:
            claimName: veles-artifacts
      containers:
        - name: backend
          image: registry.bstein.dev/veles/veles-backend:0.1.0-0 # {"$imagepolicy": "veles:veles-backend"}
          imagePullPolicy: IfNotPresent
          # Source the rendered Vault env file when present, then exec the app.
          command:
            - /bin/sh
            - -c
          args:
            - |
              if [ -f /vault/secrets/veles-env.sh ]; then
                . /vault/secrets/veles-env.sh
              fi
              exec /app/veles-backend
          envFrom:
            - configMapRef:
                name: veles-app-config
          ports:
            - name: http
              protocol: TCP
              containerPort: 8080
          volumeMounts:
            - name: artifacts
              mountPath: /data/veles-artifacts
          resources:
            requests:
              cpu: 500m
              memory: 1Gi
            limits:
              cpu: '2'
              memory: 4Gi
          securityContext:
            runAsNonRoot: true
            runAsUser: 1000
            runAsGroup: 1000
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL

View File

@ -0,0 +1,23 @@
# services/veles/configmap.yaml
# Non-secret runtime settings consumed via envFrom by the Veles deployments.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: veles
  name: veles-app-config
data:
  # General
  VELES_ENV: alpha
  VELES_PUBLIC_BASE_URL: https://veles.bstein.dev
  VELES_LOG_RETENTION_DAYS: '30'
  VELES_ARTIFACTS_PATH: /data/veles-artifacts
  # OIDC
  VELES_OIDC_ISSUER: https://sso.bstein.dev/realms/veles
  VELES_OIDC_CLIENT_ID: veles-web
  VELES_OIDC_REQUIRED_GROUPS: alpha,admin
  # Database
  VELES_DATABASE_HOST: veles-postgres.veles.svc.cluster.local
  VELES_DATABASE_PORT: '5432'
  VELES_DATABASE_NAME: veles
  # Simulation job settings
  VELES_SIM_NAMESPACE: veles
  VELES_SIM_SERVICE_ACCOUNT: veles-sim
  VELES_SIM_PRIORITY_CLASS: veles-sim
  VELES_SIM_NODE_SELECTOR: veles.bstein.dev/simulation=true
  VELES_SIM_TOLERATION_KEY: veles.bstein.dev/simulation
  VELES_SIM_TOLERATION_VALUE: 'true'

View File

@ -0,0 +1,72 @@
# services/veles/frontend-deployment.yaml
# Veles frontend Deployment, staged at zero replicas.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: veles
  name: veles-frontend
  labels:
    app: veles-frontend
spec:
  replicas: 0
  revisionHistoryLimit: 2
  selector:
    matchLabels:
      app: veles-frontend
  template:
    metadata:
      labels:
        app: veles-frontend
    spec:
      serviceAccountName: veles-frontend
      priorityClassName: veles-core
      securityContext:
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      affinity:
        nodeAffinity:
          # Hard requirement: a worker-labelled node whose hardware label is
          # rpi5, rpi4 or amd64 (both expressions must match).
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: node-role.kubernetes.io/worker
                    operator: Exists
                  - key: hardware
                    operator: In
                    values:
                      - rpi5
                      - rpi4
                      - amd64
          # Soft preferences: nodes without the spillover label first, then rpi5.
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              preference:
                matchExpressions:
                  - key: atlas.bstein.dev/spillover
                    operator: DoesNotExist
            - weight: 90
              preference:
                matchExpressions:
                  - key: hardware
                    operator: In
                    values:
                      - rpi5
      containers:
        - name: frontend
          image: registry.bstein.dev/veles/veles-frontend:0.1.0-0 # {"$imagepolicy": "veles:veles-frontend"}
          imagePullPolicy: IfNotPresent
          envFrom:
            - configMapRef:
                name: veles-app-config
          ports:
            - name: http
              protocol: TCP
              containerPort: 8080
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
          securityContext:
            runAsNonRoot: true
            runAsUser: 1000
            runAsGroup: 1000
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL

69
services/veles/image.yaml Normal file
View File

@ -0,0 +1,69 @@
# services/veles/image.yaml
# Flux image scanning for the three Veles images. Each image gets an
# ImageRepository (scanned every minute using the harbor-regcred secret) and an
# ImagePolicy selecting tags by the semver range >=0.1.0-0.

# --- backend ---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageRepository
metadata:
  name: veles-backend
  namespace: veles
spec:
  image: registry.bstein.dev/veles/veles-backend
  interval: 1m0s
  secretRef:
    name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImagePolicy
metadata:
  name: veles-backend
  namespace: veles
spec:
  imageRepositoryRef:
    name: veles-backend
  policy:
    semver:
      range: '>=0.1.0-0'
---
# --- frontend ---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageRepository
metadata:
  name: veles-frontend
  namespace: veles
spec:
  image: registry.bstein.dev/veles/veles-frontend
  interval: 1m0s
  secretRef:
    name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImagePolicy
metadata:
  name: veles-frontend
  namespace: veles
spec:
  imageRepositoryRef:
    name: veles-frontend
  policy:
    semver:
      range: '>=0.1.0-0'
---
# --- simulation worker ---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageRepository
metadata:
  name: veles-sim-worker
  namespace: veles
spec:
  image: registry.bstein.dev/veles/veles-sim-worker
  interval: 1m0s
  secretRef:
    name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImagePolicy
metadata:
  name: veles-sim-worker
  namespace: veles
spec:
  imageRepositoryRef:
    name: veles-sim-worker
  policy:
    semver:
      range: '>=0.1.0-0'

View File

@ -0,0 +1,47 @@
# services/veles/ingress.yaml
# Public entry point for veles.bstein.dev. The /api, /events and /ws prefixes
# are routed to the backend Service; every other path goes to the frontend.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: veles
  namespace: veles
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: 'true'
spec:
  ingressClassName: traefik
  tls:
    - secretName: veles-tls
      hosts:
        - veles.bstein.dev
  rules:
    - host: veles.bstein.dev
      http:
        paths:
          # Backend routes.
          - path: /api
            pathType: Prefix
            backend:
              service:
                name: veles-backend
                port:
                  number: 80
          - path: /events
            pathType: Prefix
            backend:
              service:
                name: veles-backend
                port:
                  number: 80
          - path: /ws
            pathType: Prefix
            backend:
              service:
                name: veles-backend
                port:
                  number: 80
          # Catch-all: everything else is served by the frontend.
          - path: /
            pathType: Prefix
            backend:
              service:
                name: veles-frontend
                port:
                  number: 80

View File

@ -0,0 +1,22 @@
# services/veles/kustomization.yaml
# Kustomize entry point for the Veles service; every listed resource is placed
# in the veles namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: veles
resources:
  # Namespace, identities and secret plumbing
  - namespace.yaml
  - serviceaccounts.yaml
  - secretproviderclass.yaml
  - vault-sync-deployment.yaml
  # Namespace resource policy
  - resourcequota.yaml
  - limitrange.yaml
  # Application configuration and permissions
  - configmap.yaml
  - rbac.yaml
  # Storage and database
  - artifacts-pvc.yaml
  - postgres-service.yaml
  - postgres-statefulset.yaml
  # Application workloads, image scanning and routing
  - services.yaml
  - backend-deployment.yaml
  - frontend-deployment.yaml
  - image.yaml
  - ingress.yaml
  # One-off jobs
  - oneoffs/veles-secrets-ensure-job.yaml

View File

@ -0,0 +1,21 @@
# services/veles/limitrange.yaml
# Per-container resource bounds and defaults for the veles namespace.
apiVersion: v1
kind: LimitRange
metadata:
  name: veles-container-limits
  namespace: veles
spec:
  limits:
    - type: Container
      # Lower and upper bounds a container may declare.
      min:
        cpu: 10m
        memory: 32Mi
      max:
        cpu: '16'
        memory: 32Gi
      # Requests applied to containers that do not declare their own.
      defaultRequest:
        cpu: 100m
        memory: 256Mi
      # Limits applied to containers that do not declare their own.
      default:
        cpu: 500m
        memory: 512Mi

View File

@ -0,0 +1,8 @@
# services/veles/namespace.yaml
# Namespace that holds every Veles resource.
apiVersion: v1
kind: Namespace
metadata:
  name: veles
  labels:
    app.kubernetes.io/part-of: veles
    app.kubernetes.io/name: veles

View File

@ -0,0 +1,142 @@
# services/veles/oneoffs/veles-secrets-ensure-job.yaml
# One-off job for veles/veles-secrets-ensure-1.
# Purpose: seed Veles Vault paths before app/Postgres pods are scaled up.
# Keep suspended until the veles Vault role has reconciled, then unsuspend once.
#
# What the script does:
#   1. Logs in to Vault with the pod's service-account token (Kubernetes auth).
#   2. veles/veles-db: keeps an existing POSTGRES_PASSWORD, otherwise generates
#      one, then (re)writes the connection fields including DATABASE_URL.
#   3. veles/app-secrets: keeps existing session/BYOK keys, generating any that
#      are missing.
#   4. veles/smtp: written only when shared/postmark-relay exists and has a
#      non-empty apikey.
# Existing secret values are never rotated, so re-running the job is safe.
apiVersion: batch/v1
kind: Job
metadata:
  name: veles-secrets-ensure-1
  namespace: veles
spec:
  suspend: true
  backoffLimit: 0
  ttlSecondsAfterFinished: 3600
  template:
    spec:
      serviceAccountName: veles-secrets-ensure
      restartPolicy: Never
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: node-role.kubernetes.io/worker
                    operator: Exists
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              preference:
                matchExpressions:
                  - key: kubernetes.io/arch
                    operator: In
                    values: ["arm64"]
      containers:
        - name: apply
          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -eu
              # pipefail is not implemented by every /bin/sh; probing in a
              # subshell keeps an unsupported option from aborting the script.
              if (set -o pipefail) 2>/dev/null; then
                set -o pipefail
              fi
              vault_addr="${VAULT_ADDR:-http://vault.vault.svc.cluster.local:8200}"
              vault_role="${VAULT_ROLE:-veles-secrets}"
              jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
              login_payload="$(jq -nc --arg jwt "${jwt}" --arg role "${vault_role}" '{jwt:$jwt, role:$role}')"
              vault_token="$(curl -sS --request POST --data "${login_payload}" \
                "${vault_addr}/v1/auth/kubernetes/login" | jq -r '.auth.client_token')"
              if [ -z "${vault_token}" ] || [ "${vault_token}" = "null" ]; then
                echo "vault login failed" >&2
                exit 1
              fi
              # read_secret PATH OUTFILE
              # Stores the response body in OUTFILE and prints the HTTP status;
              # callers branch on 200 / 404 / anything else.
              read_secret() {
                path="$1"
                out="$2"
                curl -sS -o "${out}" -w "%{http_code}" \
                  -H "X-Vault-Token: ${vault_token}" \
                  "${vault_addr}/v1/kv/data/atlas/${path}" || true
              }
              # write_secret PATH JSON_PAYLOAD
              # Aborts the job unless Vault answers 200 or 204.
              write_secret() {
                path="$1"
                payload="$2"
                out="$(mktemp)"
                status="$(curl -sS -o "${out}" -w "%{http_code}" -X POST \
                  -H "X-Vault-Token: ${vault_token}" \
                  -H "Content-Type: application/json" \
                  -d "${payload}" \
                  "${vault_addr}/v1/kv/data/atlas/${path}")"
                if [ "${status}" != "200" ] && [ "${status}" != "204" ]; then
                  echo "Vault write failed for ${path} (status ${status})" >&2
                  cat "${out}" >&2 || true
                  exit 1
                fi
              }
              # rand_b64 BYTES
              # Prints BYTES random bytes as single-line base64. The openssl
              # output is captured and checked before it is piped, so a failed
              # generator can never yield an empty secret.
              rand_b64() {
                bytes="$1"
                raw="$(openssl rand -base64 "${bytes}")"
                if [ -z "${raw}" ]; then
                  echo "random secret generation failed" >&2
                  exit 1
                fi
                printf '%s' "${raw}" | tr -d '\n'
              }
              status="$(read_secret veles/veles-db /tmp/veles-db.json)"
              if [ "${status}" = "200" ]; then
                db_password="$(jq -r '.data.data.POSTGRES_PASSWORD // empty' /tmp/veles-db.json)"
              elif [ "${status}" = "404" ]; then
                db_password=""
              else
                echo "Vault read failed for veles-db (status ${status})" >&2
                cat /tmp/veles-db.json >&2 || true
                exit 1
              fi
              if [ -z "${db_password}" ]; then
                db_password="$(rand_b64 36)"
              fi
              # The password is base64 and may contain "/", "+" or "=", which
              # are reserved inside a URI; @uri percent-encodes the credentials
              # so DATABASE_URL stays parseable. POSTGRES_PASSWORD stays raw.
              db_payload="$(jq -nc \
                --arg host "veles-postgres.veles.svc.cluster.local" \
                --arg port "5432" \
                --arg db "veles" \
                --arg user "veles" \
                --arg password "${db_password}" \
                '{data:{POSTGRES_HOST:$host,POSTGRES_PORT:$port,POSTGRES_DB:$db,POSTGRES_USER:$user,POSTGRES_PASSWORD:$password,DATABASE_URL:("postgresql://"+($user|@uri)+":"+($password|@uri)+"@"+$host+":"+$port+"/"+$db+"?sslmode=disable")}}')"
              write_secret veles/veles-db "${db_payload}"
              status="$(read_secret veles/app-secrets /tmp/app-secrets.json)"
              if [ "${status}" = "200" ]; then
                session_secret="$(jq -r '.data.data.VELES_SESSION_SECRET // empty' /tmp/app-secrets.json)"
                byok_key="$(jq -r '.data.data.VELES_BYOK_ENCRYPTION_KEY // empty' /tmp/app-secrets.json)"
              elif [ "${status}" = "404" ]; then
                session_secret=""
                byok_key=""
              else
                echo "Vault read failed for app-secrets (status ${status})" >&2
                cat /tmp/app-secrets.json >&2 || true
                exit 1
              fi
              if [ -z "${session_secret}" ]; then
                session_secret="$(rand_b64 48)"
              fi
              if [ -z "${byok_key}" ]; then
                byok_key="$(rand_b64 32)"
              fi
              app_payload="$(jq -nc \
                --arg session_secret "${session_secret}" \
                --arg byok_key "${byok_key}" \
                '{data:{VELES_SESSION_SECRET:$session_secret,VELES_BYOK_ENCRYPTION_KEY:$byok_key}}')"
              write_secret veles/app-secrets "${app_payload}"
              # SMTP is optional: any status other than 200, or an empty apikey,
              # leaves veles/smtp untouched.
              postmark_status="$(read_secret shared/postmark-relay /tmp/postmark.json)"
              if [ "${postmark_status}" = "200" ]; then
                smtp_password="$(jq -r '.data.data.apikey // empty' /tmp/postmark.json)"
                if [ -n "${smtp_password}" ]; then
                  # The relay apikey is written as both SMTP_USER and
                  # SMTP_PASSWORD.
                  smtp_payload="$(jq -nc \
                    --arg host "mail.bstein.dev" \
                    --arg port "587" \
                    --arg user "${smtp_password}" \
                    --arg password "${smtp_password}" \
                    --arg from "no-reply-veles@bstein.dev" \
                    --arg from_name "Veles" \
                    '{data:{SMTP_HOST:$host,SMTP_PORT:$port,SMTP_USER:$user,SMTP_PASSWORD:$password,SMTP_FROM:$from,SMTP_FROM_NAME:$from_name,SMTP_STARTTLS:"true"}}')"
                  write_secret veles/smtp "${smtp_payload}"
                fi
              fi
              echo "Veles Vault paths ready: veles-db, app-secrets, smtp when Postmark relay exists"

View File

@ -0,0 +1,17 @@
# services/veles/postgres-service.yaml
# Headless Service (clusterIP: None) selecting the veles-postgres pods; its
# name matches the StatefulSet's serviceName.
apiVersion: v1
kind: Service
metadata:
  name: veles-postgres
  namespace: veles
  labels:
    app: veles-postgres
spec:
  clusterIP: None
  selector:
    app: veles-postgres
  ports:
    - name: postgres
      protocol: TCP
      port: 5432
      targetPort: 5432

View File

@ -0,0 +1,88 @@
# services/veles/postgres-statefulset.yaml
# PostgreSQL 15 for Veles. Declared with zero replicas, so no pod is created
# until the replica count is raised. The superuser password is rendered by the
# Vault agent injector to /vault/secrets/postgres-password and consumed through
# POSTGRES_PASSWORD_FILE.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: veles-postgres
  namespace: veles
  labels:
    app: veles-postgres
spec:
  serviceName: veles-postgres
  replicas: 0
  selector:
    matchLabels:
      app: veles-postgres
  # Keep the data volume on both scale-down and StatefulSet deletion.
  persistentVolumeClaimRetentionPolicy:
    whenDeleted: Retain
    whenScaled: Retain
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: veles-postgres
      annotations:
        vault.hashicorp.com/agent-inject: "true"
        vault.hashicorp.com/agent-pre-populate-only: "true"
        vault.hashicorp.com/role: "veles"
        vault.hashicorp.com/agent-inject-secret-postgres-password: "kv/data/atlas/veles/veles-db"
        vault.hashicorp.com/agent-inject-template-postgres-password: |
          {{- with secret "kv/data/atlas/veles/veles-db" -}}
          {{ .Data.data.POSTGRES_PASSWORD }}
          {{- end -}}
    spec:
      serviceAccountName: veles-postgres
      priorityClassName: veles-core
      # Pin to the oceanus node pool and tolerate its simulation taint.
      nodeSelector:
        veles.bstein.dev/node-pool: oceanus
      tolerations:
        - key: veles.bstein.dev/simulation
          operator: Equal
          value: "true"
          effect: NoSchedule
      securityContext:
        fsGroup: 999
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: postgres
          image: postgres:15
          ports:
            - name: postgres
              containerPort: 5432
              protocol: TCP
          env:
            # Data lives in a subdirectory of the mounted volume.
            - name: PGDATA
              value: /var/lib/postgresql/data/pgdata
            - name: POSTGRES_USER
              value: veles
            - name: POSTGRES_PASSWORD_FILE
              value: /vault/secrets/postgres-password
            - name: POSTGRES_DB
              value: veles
          resources:
            requests:
              cpu: "2"
              memory: 8Gi
            limits:
              cpu: "4"
              memory: 16Gi
          securityContext:
            # With every capability dropped, a root-started entrypoint can
            # neither chown the data directory nor switch to the postgres
            # user. Start directly as that user instead; 999 matches the
            # fsGroup above.
            # NOTE(review): 999 is assumed to be the postgres uid/gid in the
            # stock postgres:15 image - confirm if the image is ever swapped.
            runAsNonRoot: true
            runAsUser: 999
            runAsGroup: 999
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: postgres-data
              mountPath: /var/lib/postgresql/data
  volumeClaimTemplates:
    - metadata:
        name: postgres-data
        labels:
          app: veles-postgres
          veles.bstein.dev/backup: longhorn
      spec:
        accessModes: ["ReadWriteOnce"]
        storageClassName: veles-oceanus-db
        resources:
          requests:
            storage: 100Gi

36
services/veles/rbac.yaml Normal file
View File

@ -0,0 +1,36 @@
# services/veles/rbac.yaml
# Namespaced permissions for the veles-backend service account: manage batch
# Jobs, read and delete pods, and read pod logs and events.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: veles-backend-jobs
  namespace: veles
rules:
  # Job lifecycle.
  - apiGroups:
      - batch
    resources:
      - jobs
    verbs:
      - create
      - delete
      - deletecollection
      - get
      - list
      - patch
      - watch
  # Pods: read and delete only.
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - delete
      - get
      - list
      - watch
  # Read-only access to pod logs.
  - apiGroups:
      - ""
    resources:
      - pods/log
    verbs:
      - get
      - list
      - watch
  # Read-only access to events in both the core and events.k8s.io groups.
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - events.k8s.io
    resources:
      - events
    verbs:
      - get
      - list
      - watch
---
# Grants the Role above to the veles-backend service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: veles-backend-jobs
  namespace: veles
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: veles-backend-jobs
subjects:
  - kind: ServiceAccount
    name: veles-backend
    namespace: veles

View File

@ -0,0 +1,54 @@
# services/veles/resourcequota.yaml
# Three quotas for the veles namespace: one namespace-wide ceiling plus one
# quota per priority class (veles-core and veles-sim).

# Namespace-wide ceiling across all pods, jobs and volume claims.
apiVersion: v1
kind: ResourceQuota
metadata:
  name: veles-namespace-quota
  namespace: veles
spec:
  hard:
    requests.cpu: '12'
    requests.memory: 24Gi
    limits.cpu: '40'
    limits.memory: 96Gi
    pods: '60'
    count/jobs.batch: '100'
    persistentvolumeclaims: '8'
    requests.storage: 300Gi
---
# Applies only to pods whose priority class is veles-core.
apiVersion: v1
kind: ResourceQuota
metadata:
  name: veles-core-quota
  namespace: veles
spec:
  scopeSelector:
    matchExpressions:
      - scopeName: PriorityClass
        operator: In
        values:
          - veles-core
  hard:
    requests.cpu: '4'
    requests.memory: 12Gi
    limits.cpu: '8'
    limits.memory: 24Gi
    pods: '12'
---
# Applies only to pods whose priority class is veles-sim.
apiVersion: v1
kind: ResourceQuota
metadata:
  name: veles-sim-quota
  namespace: veles
spec:
  scopeSelector:
    matchExpressions:
      - scopeName: PriorityClass
        operator: In
        values:
          - veles-sim
  hard:
    requests.cpu: '8'
    requests.memory: 16Gi
    limits.cpu: '32'
    limits.memory: 72Gi
    pods: '48'

View File

@ -0,0 +1,21 @@
# services/veles/secretproviderclass.yaml
# Secrets Store CSI class that reads the shared Harbor pull credential from
# Vault (role "veles") and maps it to the harbor-regcred docker-config Secret.
apiVersion: secrets-store.csi.x-k8s.io/v1
kind: SecretProviderClass
metadata:
  name: veles-vault
  namespace: veles
spec:
  provider: vault
  parameters:
    roleName: "veles"
    vaultAddress: "http://vault.vault.svc.cluster.local:8200"
    # One Vault object: the dockerconfigjson key of the shared harbor-pull path.
    objects: |
      - objectName: "harbor-pull__dockerconfigjson"
        secretPath: "kv/data/atlas/shared/harbor-pull"
        secretKey: "dockerconfigjson"
  # Kubernetes Secret populated from the object above.
  secretObjects:
    - secretName: harbor-regcred
      type: kubernetes.io/dockerconfigjson
      data:
        - key: .dockerconfigjson
          objectName: harbor-pull__dockerconfigjson

View File

@ -0,0 +1,45 @@
# services/veles/serviceaccounts.yaml
# Service accounts for the veles namespace. The backend, frontend, postgres and
# sim accounts carry the harbor-regcred pull secret; the vault-sync and
# secrets-ensure accounts do not.

# Workload identities that pull with harbor-regcred.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: veles-backend
  namespace: veles
imagePullSecrets:
  - name: harbor-regcred
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: veles-frontend
  namespace: veles
imagePullSecrets:
  - name: harbor-regcred
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: veles-postgres
  namespace: veles
imagePullSecrets:
  - name: harbor-regcred
---
# Identities without a pull secret.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: veles-vault-sync
  namespace: veles
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: veles-secrets-ensure
  namespace: veles
---
# Simulation identity: no API token is mounted into its pods.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: veles-sim
  namespace: veles
automountServiceAccountToken: false
imagePullSecrets:
  - name: harbor-regcred

View File

@ -0,0 +1,32 @@
# services/veles/services.yaml
# Services for the backend and frontend. Each exposes port 80 and forwards to
# container port 8080 on the pods it selects.
apiVersion: v1
kind: Service
metadata:
  name: veles-backend
  namespace: veles
  labels:
    app: veles-backend
spec:
  selector:
    app: veles-backend
  ports:
    - name: http
      protocol: TCP
      port: 80
      targetPort: 8080
---
apiVersion: v1
kind: Service
metadata:
  name: veles-frontend
  namespace: veles
  labels:
    app: veles-frontend
spec:
  selector:
    app: veles-frontend
  ports:
    - name: http
      protocol: TCP
      port: 80
      targetPort: 8080

View File

@ -0,0 +1,43 @@
# services/veles/vault-sync-deployment.yaml
# Idle pod whose only job is to keep the veles-vault SecretProviderClass
# mounted through the Secrets Store CSI driver; the container just sleeps.
# NOTE(review): presumably this mount is what keeps the harbor-regcred Secret
# declared in secretproviderclass.yaml synced - confirm against the CSI driver
# configuration.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: veles-vault-sync
  namespace: veles
  labels:
    app: veles-vault-sync
spec:
  replicas: 1
  selector:
    matchLabels:
      app: veles-vault-sync
  template:
    metadata:
      labels:
        app: veles-vault-sync
    spec:
      serviceAccountName: veles-vault-sync
      containers:
        - name: sync
          image: alpine:3.20
          command: ["/bin/sh", "-c"]
          args:
            - "sleep infinity"
          resources:
            requests:
              cpu: 10m
              # The veles-container-limits LimitRange (limitrange.yaml) sets a
              # 32Mi per-container memory minimum; a smaller request is
              # rejected at admission and the pod is never created.
              memory: 32Mi
            limits:
              cpu: 50m
              memory: 64Mi
          volumeMounts:
            - name: vault-secrets
              mountPath: /vault/secrets
              readOnly: true
      volumes:
        - name: vault-secrets
          csi:
            driver: secrets-store.csi.k8s.io
            readOnly: true
            volumeAttributes:
              secretProviderClass: veles-vault