veles: harden app infrastructure contract

This commit is contained in:
jenkins 2026-06-09 11:59:27 -03:00
parent 07073970cf
commit 6833c3fe61
9 changed files with 146 additions and 35 deletions

View File

@ -28,3 +28,33 @@ spec:
- veles-postgres
retain: 8
concurrency: 1
---
# Longhorn recurring backup job for the Veles artifact volume.
# Referenced by name from the artifacts StorageClass recurringJobSelector.
apiVersion: longhorn.io/v1beta2
kind: RecurringJob
metadata:
  name: veles-artifacts-backup
  namespace: longhorn-system
spec:
  name: veles-artifacts-backup
  # Once a day at 05:45. Quoted so the leading digits and "*" stay a string.
  cron: "45 5 * * *"
  task: backup
  # Volumes labelled into either group also pick this job up.
  groups:
    - veles
    - veles-artifacts
  # Number of backups kept before the oldest is pruned.
  retain: 7
  # Process one volume at a time for this job.
  concurrency: 1
---
# Longhorn recurring snapshot job for the Veles artifact volume.
# Referenced by name from the artifacts StorageClass recurringJobSelector.
apiVersion: longhorn.io/v1beta2
kind: RecurringJob
metadata:
  name: veles-artifacts-snapshot
  namespace: longhorn-system
spec:
  name: veles-artifacts-snapshot
  # Minute 15 of every sixth hour (00:15, 06:15, 12:15, 18:15).
  cron: "15 */6 * * *"
  task: snapshot
  # Volumes labelled into either group also pick this job up.
  groups:
    - veles
    - veles-artifacts
  # Number of snapshots kept; at four per day this is about two days.
  retain: 8
  # Process one volume at a time for this job.
  concurrency: 1

View File

@ -15,6 +15,7 @@ parameters:
fsType: ext4
replicaAutoBalance: disabled
dataLocality: strict-local
recurringJobSelector: '[{"name":"veles-artifacts-backup","isGroup":false},{"name":"veles-artifacts-snapshot","isGroup":false}]'
reclaimPolicy: Retain
allowVolumeExpansion: true
volumeBindingMode: WaitForFirstConsumer

View File

@ -1,11 +1,11 @@
# services/keycloak/oneoffs/veles-realm-ensure-job.yaml
# One-off job for sso/veles-realm-ensure-3.
# One-off job for sso/veles-realm-ensure-4.
# Purpose: create the Veles realm, groups, OIDC client, SMTP settings, and Vault client secret.
# Keep suspended until Veles Vault paths/policies have reconciled, then unsuspend once.
apiVersion: batch/v1
kind: Job
metadata:
name: veles-realm-ensure-3
name: veles-realm-ensure-4
namespace: sso
spec:
suspend: true
@ -194,14 +194,65 @@ spec:
)
if status != 200:
raise SystemExit(f"Group search failed for {name}: status={status}")
if any(group.get("name") == name for group in groups or []):
return
for group in groups or []:
if group.get("name") == name:
return group["id"]
status, body = request("POST", f"{base_url}/admin/realms/{realm}/groups", token, {"name": name})
if status not in (201, 204, 409):
raise SystemExit(f"Group create failed for {name}: status={status} body={body}")
status, groups = request(
"GET",
f"{base_url}/admin/realms/{realm}/groups?search={urllib.parse.quote(name)}",
token,
)
if status != 200:
raise SystemExit(f"Group lookup failed after create for {name}: status={status}")
for group in groups or []:
if group.get("name") == name:
return group["id"]
raise SystemExit(f"Group {name} not found after create")
ensure_group("alpha")
ensure_group("admin")
def ensure_role(name):
    """Return the realm role called name, creating it when it is missing.

    Looks the role up by name; on a 404 it posts a create request and then
    looks the role up again so the caller always receives the full role
    representation.

    Returns the role as a dict.
    Raises SystemExit when the create call returns anything other than
    201, 204 or 409, or when the final lookup is not a 200 with a dict body.
    """
    # safe="" also percent-encodes "/", so a role name can never change the
    # shape of the URL path. Plain names such as alpha are unaffected.
    encoded_name = urllib.parse.quote(name, safe="")
    role_url = f"{base_url}/admin/realms/{realm}/roles/{encoded_name}"
    status, role = request("GET", role_url, token)
    if status == 404:
        status, body = request("POST", f"{base_url}/admin/realms/{realm}/roles", token, {"name": name})
        # 409 is accepted: the role already exists, so the lookup below will find it.
        if status not in (201, 204, 409):
            raise SystemExit(f"Role create failed for {name}: status={status} body={body}")
        status, role = request("GET", role_url, token)
    if status != 200 or not isinstance(role, dict):
        raise SystemExit(f"Role lookup failed for {name}: status={status}")
    return role
def ensure_group_role(group_id, role):
    """Map a realm role onto a group unless the group already carries it.

    group_id is the id of the target group; role is a role representation
    dict that contains at least a name key.

    Raises SystemExit when the mapping lookup is not a 200, or when the
    mapping request returns anything other than 200 or 204.
    """
    mappings_url = f"{base_url}/admin/realms/{realm}/groups/{group_id}/role-mappings/realm"
    status, existing = request("GET", mappings_url, token)
    if status != 200:
        raise SystemExit(f"Group role mapping lookup failed: status={status}")
    # Nothing to do when a mapping with the same role name is already present.
    for entry in existing or []:
        if entry.get("name") == role["name"]:
            return
    status, body = request("POST", mappings_url, token, [role])
    if status in (200, 204):
        return
    raise SystemExit(f"Group role mapping failed for {role['name']}: status={status} body={body}")
# Make sure both groups exist and keep their ids for the role mappings.
alpha_group_id = ensure_group("alpha")
admin_group_id = ensure_group("admin")
# Make sure the realm roles with the same names exist.
alpha_role = ensure_role("alpha")
admin_role = ensure_role("admin")
# The alpha group gets the alpha role; the admin group gets both roles.
ensure_group_role(alpha_group_id, alpha_role)
ensure_group_role(admin_group_id, alpha_role)
ensure_group_role(admin_group_id, admin_role)
status, clients = request(
"GET",

View File

@ -16,6 +16,13 @@ This stack is staged for Flux and intentionally starts the app deployments at `r
- `registry.bstein.dev/veles/veles-backend`
- `registry.bstein.dev/veles/veles-frontend`
- `registry.bstein.dev/veles/veles-sim-worker`
- Backend/frontend deployments are placeholders and remain scaled to `0` until final image layout, container ports, and health endpoints are confirmed. Services route to a named `http` target port so the numeric container port can change without changing Ingress.
## Auth Contract
Veles owns authorization in the app. The `veles` Ingress does not use oauth2-proxy or Traefik forward-auth, so no ingress/auth layer should strip OIDC token claims. The app should validate tokens from `https://sso.bstein.dev/realms/veles` and expect stable `sub`, `email`, `preferred_username`, `groups`, and `realm_access.roles` claims.
The Keycloak realm setup creates both groups and realm roles named `alpha` and `admin`. Members of the `alpha` group receive the `alpha` realm role; members of `admin` receive both `alpha` and `admin`. Built-in/meta strategies can stay universal, while runs and user-created strategies should remain user-scoped in the Veles database.
## Runtime Env
@ -25,12 +32,20 @@ Veles should consume:
- `VELES_OIDC_ISSUER=https://sso.bstein.dev/realms/veles`
- `VELES_OIDC_CLIENT_ID=veles-web`
- `VELES_OIDC_REQUIRED_GROUPS=alpha,admin`
- `VELES_OIDC_GROUPS_CLAIM=groups`
- `VELES_OIDC_ROLES_CLAIM=realm_access.roles`
- `DATABASE_URL` from `kv/data/atlas/veles/veles-db`
- `VELES_SESSION_SECRET` from `kv/data/atlas/veles/app-secrets`
- `VELES_BYOK_ENCRYPTION_KEY` from `kv/data/atlas/veles/app-secrets`
User OpenAI API keys must stay in the Veles database encrypted with `VELES_BYOK_ENCRYPTION_KEY`; do not store per-user BYOK secrets in Vault.
Backend runtime secrets are synced from Vault by `veles-vault` into the generated Kubernetes Secret `veles-runtime-secrets`; no secret values are committed. The backend consumes that secret with `envFrom`.
## Artifact Contract
`veles-artifacts` is an RWO Longhorn PVC mounted into backend pods at `/data/veles-artifacts`. Backend pods own artifact writes and serving. Simulation Jobs should not mount or write directly to this PVC unless they are explicitly scheduled on Oceanus with the Veles toleration and the app has chosen a same-node direct-write model. Queue-mediated upload/copy through the backend remains the safer default until the app contract settles.
## Simulation Jobs
The backend service account can create, watch, and delete Jobs only inside the `veles` namespace. Simulation pods should use service account `veles-sim`, set `automountServiceAccountToken: false`, and use:
@ -53,12 +68,12 @@ tolerations:
3. Confirm the node normalizer applies the Veles labels and taint.
4. Add Oceanus Longhorn disks at paths tagged by the Longhorn tag ensure job.
5. Let Vault policy reconciliation run, then unsuspend `veles-secrets-ensure-2`.
6. Unsuspend `veles-realm-ensure-3` in `services/keycloak` to create the realm/client secret.
6. Unsuspend `veles-realm-ensure-4` in `services/keycloak` to create the realm, the OIDC client and its secret, the groups, the realm roles, and the group role mappings.
7. Create the Harbor `veles` project or robot access before image automation is enabled in production.
8. Scale `veles-postgres`, then backend/frontend once app images exist.
## Assumptions
- `veles-oceanus-artifacts` is RWO for alpha; simulation workers should either run on Oceanus with the backend or stream logs to the backend, which owns writes.
- Postgres uses Longhorn backup recurring jobs off Oceanus. This is not a substitute for a tested restore drill.
- Longhorn default backup target is `s3://atlas-soteria@us-west-004/` with credential secret `longhorn-backup-b2`; the live `BackupTarget/default` currently reports available. Postgres and artifact volumes have Longhorn recurring snapshot and backup jobs attached by their StorageClasses. This is not a substitute for a tested restore drill.
- The Jenkins job skeleton points at the Veles repo but stays disabled until that repo provides a Jenkinsfile.

View File

@ -7,6 +7,7 @@ metadata:
labels:
app.kubernetes.io/name: veles
app.kubernetes.io/component: artifacts
veles.bstein.dev/backup: longhorn
spec:
accessModes:
- ReadWriteOnce

View File

@ -16,24 +16,6 @@ spec:
metadata:
labels:
app: veles-backend
annotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/agent-pre-populate-only: "true"
vault.hashicorp.com/role: "veles"
vault.hashicorp.com/agent-inject-secret-veles-env.sh: "kv/data/atlas/veles/veles-db"
vault.hashicorp.com/agent-inject-template-veles-env.sh: |
{{- with secret "kv/data/atlas/veles/veles-db" }}
export DATABASE_URL="{{ .Data.data.DATABASE_URL }}"
export VELES_DATABASE_USER="{{ .Data.data.POSTGRES_USER }}"
export VELES_DATABASE_PASSWORD="{{ .Data.data.POSTGRES_PASSWORD }}"
{{- end }}
{{- with secret "kv/data/atlas/veles/veles-oidc" }}
export VELES_OIDC_CLIENT_SECRET="{{ .Data.data.client_secret }}"
{{- end }}
{{- with secret "kv/data/atlas/veles/app-secrets" }}
export VELES_SESSION_SECRET="{{ .Data.data.VELES_SESSION_SECRET }}"
export VELES_BYOK_ENCRYPTION_KEY="{{ .Data.data.VELES_BYOK_ENCRYPTION_KEY }}"
{{- end }}
spec:
serviceAccountName: veles-backend
priorityClassName: veles-core
@ -52,13 +34,6 @@ spec:
- name: backend
image: registry.bstein.dev/veles/veles-backend:0.1.0-0 # {"$imagepolicy": "veles:veles-backend"}
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "-c"]
args:
- |
if [ -f /vault/secrets/veles-env.sh ]; then
. /vault/secrets/veles-env.sh
fi
exec /app/veles-backend
ports:
- name: http
containerPort: 8080
@ -66,6 +41,8 @@ spec:
envFrom:
- configMapRef:
name: veles-app-config
- secretRef:
name: veles-runtime-secrets
resources:
requests:
cpu: 500m

View File

@ -10,10 +10,13 @@ data:
VELES_OIDC_ISSUER: https://sso.bstein.dev/realms/veles
VELES_OIDC_CLIENT_ID: veles-web
VELES_OIDC_REQUIRED_GROUPS: alpha,admin
VELES_OIDC_GROUPS_CLAIM: groups
VELES_OIDC_ROLES_CLAIM: realm_access.roles
VELES_DATABASE_HOST: veles-postgres.veles.svc.cluster.local
VELES_DATABASE_PORT: "5432"
VELES_DATABASE_NAME: veles
VELES_ARTIFACTS_PATH: /data/veles-artifacts
VELES_ARTIFACTS_MODE: rwo-backend-owned
VELES_SIM_NAMESPACE: veles
VELES_SIM_SERVICE_ACCOUNT: veles-sim
VELES_SIM_PRIORITY_CLASS: veles-sim

View File

@ -13,9 +13,42 @@ spec:
- objectName: "harbor-pull__dockerconfigjson"
secretPath: "kv/data/atlas/shared/harbor-pull"
secretKey: "dockerconfigjson"
- objectName: "veles-db__DATABASE_URL"
secretPath: "kv/data/atlas/veles/veles-db"
secretKey: "DATABASE_URL"
- objectName: "veles-db__POSTGRES_USER"
secretPath: "kv/data/atlas/veles/veles-db"
secretKey: "POSTGRES_USER"
- objectName: "veles-db__POSTGRES_PASSWORD"
secretPath: "kv/data/atlas/veles/veles-db"
secretKey: "POSTGRES_PASSWORD"
- objectName: "veles-oidc__client_secret"
secretPath: "kv/data/atlas/veles/veles-oidc"
secretKey: "client_secret"
- objectName: "veles-app-secrets__VELES_SESSION_SECRET"
secretPath: "kv/data/atlas/veles/app-secrets"
secretKey: "VELES_SESSION_SECRET"
- objectName: "veles-app-secrets__VELES_BYOK_ENCRYPTION_KEY"
secretPath: "kv/data/atlas/veles/app-secrets"
secretKey: "VELES_BYOK_ENCRYPTION_KEY"
secretObjects:
- secretName: harbor-regcred
type: kubernetes.io/dockerconfigjson
data:
- objectName: harbor-pull__dockerconfigjson
key: .dockerconfigjson
- secretName: veles-runtime-secrets
type: Opaque
data:
- objectName: veles-db__DATABASE_URL
key: DATABASE_URL
- objectName: veles-db__POSTGRES_USER
key: VELES_DATABASE_USER
- objectName: veles-db__POSTGRES_PASSWORD
key: VELES_DATABASE_PASSWORD
- objectName: veles-oidc__client_secret
key: VELES_OIDC_CLIENT_SECRET
- objectName: veles-app-secrets__VELES_SESSION_SECRET
key: VELES_SESSION_SECRET
- objectName: veles-app-secrets__VELES_BYOK_ENCRYPTION_KEY
key: VELES_BYOK_ENCRYPTION_KEY

View File

@ -11,7 +11,7 @@ spec:
- name: http
port: 80
protocol: TCP
targetPort: 8080
targetPort: http
selector:
app: veles-backend
---
@ -27,6 +27,6 @@ spec:
- name: http
port: 80
protocol: TCP
targetPort: 8080
targetPort: http
selector:
app: veles-frontend