Compare commits

..

No commits in common. "main" and "codex/testing-dashboard-health-20260604" have entirely different histories.

97 changed files with 358 additions and 4730 deletions

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./services/ai-llm
targetNamespace: ai
prune: true

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./services/finance
prune: true
sourceRef:

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./services/game-stream
targetNamespace: game-stream
prune: true

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./services/health
prune: true
sourceRef:

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./services/jellyfin
targetNamespace: jellyfin
prune: true

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./services/jenkins
prune: true
sourceRef:

View File

@ -28,7 +28,6 @@ resources:
- ai-llm/kustomization.yaml
- openclaw/kustomization.yaml
- game-stream/kustomization.yaml
- veles/kustomization.yaml
- typhon/kustomization.yaml
- nextcloud/kustomization.yaml
- nextcloud-mail-sync/kustomization.yaml

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
sourceRef:
kind: GitRepository
name: flux-system

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
prune: true
sourceRef:
kind: GitRepository

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./services/nextcloud
targetNamespace: nextcloud
prune: true

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./services/outline
prune: true
sourceRef:

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./services/planka
prune: true
sourceRef:

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./services/quality
prune: true
sourceRef:

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./services/typhon
prune: true
sourceRef:

View File

@ -8,7 +8,7 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
suspend: false
sourceRef:
kind: GitRepository
name: flux-system

View File

@ -1,29 +0,0 @@
# clusters/atlas/flux-system/applications/veles/image-automation.yaml
# Staged for the first Veles image rollout. Add this file to the parent
# applications kustomization after the namespace exists and the Harbor repos
# have initial tags.
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageUpdateAutomation
metadata:
name: veles
namespace: veles
spec:
interval: 1m0s
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
git:
checkout:
ref:
branch: main
commit:
author:
email: ops@bstein.dev
name: flux-bot
messageTemplate: "chore(veles): automated image update"
push:
branch: main
update:
strategy: Setters
path: services/veles

View File

@ -1,29 +0,0 @@
# clusters/atlas/flux-system/applications/veles/kustomization.yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: veles
namespace: flux-system
annotations:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./services/veles
targetNamespace: veles
prune: true
sourceRef:
kind: GitRepository
name: flux-system
namespace: flux-system
dependsOn:
- name: cert-manager
- name: core
- name: keycloak
- name: longhorn
- name: traefik
- name: vault
- name: vault-csi
- name: vault-injector
wait: false
timeout: 20m

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./services/crypto/wallet-monero-temp
targetNamespace: crypto
prune: true

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./services/crypto/xmr-miner
targetNamespace: crypto
prune: true

View File

@ -5966,7 +5966,7 @@ spec:
- args:
- --events-addr=http://notification-controller.$(RUNTIME_NAMESPACE).svc.cluster.local./
- --watch-all-namespaces=true
- --concurrent=4
- --concurrent=1
- --requeue-dependency=5s
- --interval-jitter-percentage=30
- --log-level=info

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 30m
suspend: true
path: ./infrastructure/descheduler
prune: true
sourceRef:

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
timeout: 10m
path: ./services/gitops-ui
prune: true

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./infrastructure/longhorn/ui-ingress
targetNamespace: longhorn-system
prune: true

View File

@ -8,7 +8,6 @@ metadata:
kustomize.toolkit.fluxcd.io/ssa: IfNotPresent
spec:
interval: 10m
suspend: true
path: ./infrastructure/resource-guardrails
prune: true
sourceRef:

View File

@ -24,17 +24,8 @@ spec:
- bash
- -ceu
- |
KUBE_TOKEN_PATH="/var/run/secrets/kubernetes.io/serviceaccount/token"
KUBE_CA_PATH="/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
KUBE_SERVER="https://${KUBERNETES_SERVICE_HOST}:${KUBERNETES_SERVICE_PORT_HTTPS:-443}"
k() {
kubectl \
--server="${KUBE_SERVER}" \
--certificate-authority="${KUBE_CA_PATH}" \
--token="$(cat "${KUBE_TOKEN_PATH}")" \
--request-timeout=10s \
"$@"
kubectl --request-timeout=10s "$@"
}
clear_worker() {
@ -42,9 +33,9 @@ spec:
local hardware="${2}"
if k get node "${node}" >/dev/null 2>&1; then
k label node "${node}" node-role.kubernetes.io/worker=true "hardware=${hardware}" --overwrite=true || true
k label node "${node}" node-role.kubernetes.io/storage-backbone- || true
k label node "${node}" atlas.bstein.dev/spillover- || true
# Recovery cordons are owned by Ananke, not this role reconciler.
k taint node "${node}" node.kubernetes.io/unschedulable:NoSchedule- || true
k uncordon "${node}" || true
else
echo "skipping missing node ${node}"
fi
@ -64,28 +55,9 @@ spec:
k label node titan-22 atlas.bstein.dev/general-compute=last-resort --overwrite=true || true
fi
if k get node titan-23 >/dev/null 2>&1; then
k label node titan-23 \
veles.bstein.dev/simulation=true \
veles.bstein.dev/node-pool=oceanus \
node-role.kubernetes.io/veles-sim=true \
longhorn-host=true \
hardware=oceanus \
--overwrite=true || true
k label node titan-23 node-role.kubernetes.io/worker- || true
k taint node titan-23 veles.bstein.dev/simulation=true:NoSchedule --overwrite=true || true
else
echo "skipping missing node titan-23"
fi
for node in titan-13 titan-15 titan-17 titan-19; do
if k get node "${node}" >/dev/null 2>&1; then
k label node "${node}" \
atlas.bstein.dev/spillover=true \
longhorn-host=true \
node-role.kubernetes.io/worker=true \
node-role.kubernetes.io/storage-backbone=true \
--overwrite=true || true
k label node "${node}" atlas.bstein.dev/spillover=true longhorn-host=true --overwrite=true || true
k taint node "${node}" longhorn=true:PreferNoSchedule --overwrite=true || true
k taint node "${node}" atlas.bstein.dev/spillover=true:PreferNoSchedule --overwrite=true || true
else

View File

@ -6,7 +6,7 @@ metadata:
rules:
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "patch", "update"]
verbs: ["get", "list", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding

View File

@ -37,7 +37,7 @@ spec:
createSecret: false
registrySecret: longhorn-registry
image:
pullPolicy: IfNotPresent
pullPolicy: Always
longhorn:
engine:
repository: registry.bstein.dev/infra/longhorn-engine
@ -80,22 +80,11 @@ spec:
repository: registry.bstein.dev/infra/longhorn-livenessprobe
tag: v2.16.0
defaultSettings:
systemManagedPodsImagePullPolicy: if-not-present
taintToleration: veles.bstein.dev/simulation=true:NoSchedule
systemManagedPodsImagePullPolicy: Always
longhornManager:
tolerations:
- key: veles.bstein.dev/simulation
operator: Equal
value: "true"
effect: NoSchedule
nodeSelector:
longhorn-host: "true"
longhornDriver:
tolerations:
- key: veles.bstein.dev/simulation
operator: Equal
value: "true"
effect: NoSchedule
nodeSelector:
longhorn-host: "true"
longhornUI:

View File

@ -7,9 +7,7 @@ resources:
- secretproviderclass.yaml
- vault-sync-deployment.yaml
- helmrelease.yaml
- veles-recurring-jobs.yaml
- longhorn-settings-ensure-job.yaml
- longhorn-csi-toleration-ensure-job.yaml
- longhorn-disk-tags-ensure-job.yaml
configMapGenerator:

View File

@ -1,106 +0,0 @@
# infrastructure/longhorn/core/longhorn-csi-toleration-ensure-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
name: longhorn-csi-toleration-ensure-4
namespace: longhorn-system
spec:
backoffLimit: 0
activeDeadlineSeconds: 240
ttlSecondsAfterFinished: 3600
template:
spec:
serviceAccountName: longhorn-service-account
restartPolicy: Never
nodeSelector:
kubernetes.io/hostname: titan-11
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
- key: node-role.kubernetes.io/worker
operator: Exists
containers:
- name: patch
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/bin/sh", "-c"]
args:
- |
set -euo pipefail
ns="longhorn-system"
ds="longhorn-csi-plugin"
key="veles.bstein.dev/simulation"
value="true"
effect="NoSchedule"
patch_daemonset() {
target="$1"
current="$(kubectl -n "${ns}" get daemonset "${target}" -o json)"
if echo "${current}" | jq -e \
--arg key "${key}" \
--arg value "${value}" \
--arg effect "${effect}" \
'.spec.template.spec.tolerations[]? | select(.key == $key and .value == $value and .effect == $effect)' >/dev/null; then
echo "${target} already tolerates ${key}=${value}:${effect}"
return 0
fi
patch="$(echo "${current}" | jq -c \
--arg key "${key}" \
--arg value "${value}" \
--arg effect "${effect}" \
'{
spec: {
template: {
spec: {
tolerations: ((.spec.template.spec.tolerations // []) + [
{key: $key, operator: "Equal", value: $value, effect: $effect}
])
}
}
}
}')"
kubectl -n "${ns}" patch daemonset "${target}" --type=merge -p "${patch}"
}
patch_daemonset "${ds}"
engine_daemonsets="$(kubectl -n "${ns}" get daemonset -l longhorn.io/component=engine-image -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}')"
for engine_ds in ${engine_daemonsets}; do
patch_daemonset "${engine_ds}"
done
csi_ready="false"
for attempt in $(seq 1 90); do
if kubectl get csinode titan-23 -o json | jq -e '.spec.drivers[]? | select(.name == "driver.longhorn.io")' >/dev/null; then
echo "driver.longhorn.io registered on titan-23"
csi_ready="true"
break
fi
sleep 2
done
if [ "${csi_ready}" != "true" ]; then
echo "driver.longhorn.io did not register on titan-23 before timeout" >&2
exit 1
fi
for engine_ds in ${engine_daemonsets}; do
for attempt in $(seq 1 90); do
if kubectl -n "${ns}" get pods -o json | jq -e \
--arg engine_ds "${engine_ds}" \
'.items[] | select(.spec.nodeName == "titan-23") | select(.metadata.ownerReferences[]?.name == $engine_ds) | select([.status.containerStatuses[]?.ready] | all)' >/dev/null; then
echo "${engine_ds} ready on titan-23"
break
fi
if [ "${attempt}" = "90" ]; then
echo "${engine_ds} did not become ready on titan-23 before timeout" >&2
exit 1
fi
sleep 2
done
done

View File

@ -2,7 +2,7 @@
apiVersion: batch/v1
kind: Job
metadata:
name: longhorn-disk-tags-ensure-3
name: longhorn-disk-tags-ensure-1
namespace: longhorn-system
spec:
backoffLimit: 0

View File

@ -2,7 +2,7 @@
apiVersion: batch/v1
kind: Job
metadata:
name: longhorn-settings-ensure-10
name: longhorn-settings-ensure-7
namespace: longhorn-system
spec:
backoffLimit: 0
@ -12,8 +12,6 @@ spec:
spec:
serviceAccountName: longhorn-service-account
restartPolicy: Never
nodeSelector:
kubernetes.io/hostname: titan-11
volumes:
- name: longhorn-settings-ensure-script
configMap:

View File

@ -17,28 +17,10 @@ import urllib.request
LONGHORN_NS = "longhorn-system"
LONGHORN_API = "/apis/longhorn.io/v1beta2/namespaces/{namespace}/nodes"
DESIRED_DISK_TAGS = {
"/mnt/astreae": ["astreae"],
"/mnt/asteria": ["asteria"],
"/mnt/veles": ["veles-oceanus", "veles-db", "veles-artifacts"],
"/mnt/veles-db": ["veles-oceanus", "veles-db"],
"/mnt/veles-artifacts": ["veles-oceanus", "veles-artifacts"],
DESIRED_TAGS = {
"/mnt/astreae": "astreae",
"/mnt/asteria": "asteria",
}
DESIRED_NODE_TAGS = {
"titan-23": ["veles-oceanus"],
}
DESIRED_NODE_DISKS = {
"titan-23": {
"veles-oceanus": {
"path": "/mnt/veles",
"allowScheduling": True,
"evictionRequested": False,
"storageReserved": 0,
"tags": ["veles-oceanus", "veles-db", "veles-artifacts"],
}
}
}
DISABLE_DEFAULT_DISK_NODES = {"titan-23"}
def api_base() -> str:
@ -81,30 +63,8 @@ def list_nodes() -> list[dict]:
return data.get("items", [])
def merged_tags(current_tags: list[str], desired_tags: list[str]) -> list[str]:
return sorted(dict.fromkeys([*current_tags, *desired_tags]))
def patch_node_tags(node_name: str, desired_tags: list[str]) -> None:
body = {"spec": {"tags": desired_tags}}
request_json(
"PATCH",
f"{LONGHORN_API.format(namespace=LONGHORN_NS)}/{node_name}",
body=body,
)
def patch_disk_tags(node_name: str, disk_name: str, desired_tags: list[str]) -> None:
body = {"spec": {"disks": {disk_name: {"tags": desired_tags}}}}
request_json(
"PATCH",
f"{LONGHORN_API.format(namespace=LONGHORN_NS)}/{node_name}",
body=body,
)
def patch_disks(node_name: str, disks: dict) -> None:
body = {"spec": {"disks": disks}}
def patch_disk_tags(node_name: str, disk_name: str, desired_tag: str) -> None:
body = {"spec": {"disks": {disk_name: {"tags": [desired_tag]}}}}
request_json(
"PATCH",
f"{LONGHORN_API.format(namespace=LONGHORN_NS)}/{node_name}",
@ -118,52 +78,18 @@ def main() -> int:
for node in list_nodes():
name = node.get("metadata", {}).get("name", "")
desired_node_tags = DESIRED_NODE_TAGS.get(name)
if desired_node_tags:
current_node_tags = node.get("spec", {}).get("tags") or []
next_node_tags = merged_tags(current_node_tags, desired_node_tags)
if current_node_tags != next_node_tags:
print(f"patching {name} node tags={current_node_tags!r} -> {next_node_tags!r}")
patch_node_tags(name, next_node_tags)
changed += 1
else:
skipped += 1
spec_disks = node.get("spec", {}).get("disks", {}) or {}
desired_disks = DESIRED_NODE_DISKS.get(name, {})
missing_disks = {
disk_name: disk_spec
for disk_name, disk_spec in desired_disks.items()
if disk_name not in spec_disks
}
if missing_disks:
print(f"adding {name} disks={sorted(missing_disks)}")
patch_disks(name, missing_disks)
changed += len(missing_disks)
spec_disks = {**spec_disks, **missing_disks}
if name in DISABLE_DEFAULT_DISK_NODES:
disable_patch = {}
for disk_name, disk in spec_disks.items():
disk_path = (disk.get("path") or "").rstrip("/")
if disk_path == "/var/lib/longhorn" and disk.get("allowScheduling", True):
disable_patch[disk_name] = {"allowScheduling": False}
if disable_patch:
print(f"disabling default Longhorn scheduling on {name} disks={sorted(disable_patch)}")
patch_disks(name, disable_patch)
changed += len(disable_patch)
for disk_name, disk in spec_disks.items():
disk_path = disk.get("path")
desired_disk_tags = DESIRED_DISK_TAGS.get(disk_path)
if not desired_disk_tags:
desired_tag = DESIRED_TAGS.get(disk_path)
if not desired_tag:
continue
current_tags = disk.get("tags") or []
if current_tags == desired_disk_tags:
if current_tags == [desired_tag]:
skipped += 1
continue
print(f"patching {name}:{disk_name} path={disk_path} tags={current_tags!r} -> {desired_disk_tags!r}")
patch_disk_tags(name, disk_name, desired_disk_tags)
print(f"patching {name}:{disk_name} path={disk_path} tags={current_tags!r} -> {[desired_tag]!r}")
patch_disk_tags(name, disk_name, desired_tag)
changed += 1
print(f"done: changed={changed} skipped={skipped}")

View File

@ -30,25 +30,10 @@ update_setting() {
fi
echo "Setting ${name} -> ${value}"
out="$(mktemp)"
if curl ${curl_opts} -o "${out}" -X PUT \
curl ${curl_opts} -X PUT \
-H "Content-Type: application/json" \
-d "{\"value\":\"${value}\"}" \
"${api_base}/${name}"; then
rm -f "${out}"
return 0
fi
current="$(curl ${curl_opts} "${api_base}/${name}" || true)"
if echo "${current}" | grep -Fq "\"value\":\"${value}\""; then
echo "Setting ${name} stored; Longhorn will apply it when current state allows."
rm -f "${out}"
return 0
fi
cat "${out}" >&2 || true
rm -f "${out}"
return 1
"${api_base}/${name}" >/dev/null
}
wait_for_api
@ -56,7 +41,6 @@ update_setting default-engine-image "registry.bstein.dev/infra/longhorn-engine:v
update_setting default-instance-manager-image "registry.bstein.dev/infra/longhorn-instance-manager:v1.8.2"
update_setting default-backing-image-manager-image "registry.bstein.dev/infra/longhorn-backing-image-manager:v1.8.2"
update_setting support-bundle-manager-image "registry.bstein.dev/infra/longhorn-support-bundle-kit:v0.0.56"
update_setting taint-toleration "veles.bstein.dev/simulation=true:NoSchedule"
# Keep storage-heavy nodes from getting hammered by rebuild storms and skew.
update_setting replica-auto-balance "best-effort"
update_setting concurrent-replica-rebuild-per-node-limit "2"

View File

@ -1,60 +0,0 @@
# infrastructure/longhorn/core/veles-recurring-jobs.yaml
apiVersion: longhorn.io/v1beta2
kind: RecurringJob
metadata:
name: veles-postgres-backup
namespace: longhorn-system
spec:
name: veles-postgres-backup
cron: "30 5 * * *"
task: backup
groups:
- veles
- veles-postgres
retain: 7
concurrency: 1
---
apiVersion: longhorn.io/v1beta2
kind: RecurringJob
metadata:
name: veles-postgres-snapshot
namespace: longhorn-system
spec:
name: veles-postgres-snapshot
cron: "*/30 * * * *"
task: snapshot
groups:
- veles
- veles-postgres
retain: 8
concurrency: 1
---
apiVersion: longhorn.io/v1beta2
kind: RecurringJob
metadata:
name: veles-artifacts-backup
namespace: longhorn-system
spec:
name: veles-artifacts-backup
cron: "45 5 * * *"
task: backup
groups:
- veles
- veles-artifacts
retain: 7
concurrency: 1
---
apiVersion: longhorn.io/v1beta2
kind: RecurringJob
metadata:
name: veles-artifacts-snapshot
namespace: longhorn-system
spec:
name: veles-artifacts-snapshot
cron: "15 */6 * * *"
task: snapshot
groups:
- veles
- veles-artifacts
retain: 8
concurrency: 1

View File

@ -3,4 +3,3 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- scavenger.yaml
- veles.yaml

View File

@ -1,17 +0,0 @@
# infrastructure/modules/base/priorityclass/veles.yaml
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
name: veles-core
value: 500
globalDefault: false
description: "For Veles core database, API, and controller workloads"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
name: veles-sim
value: 50
globalDefault: false
preemptionPolicy: Never
description: "For Veles simulation jobs; lower than core and non-preempting"

View File

@ -5,6 +5,3 @@ resources:
- asteria.yaml
- asteria-encrypted.yaml
- astreae.yaml
- veles-oceanus-db.yaml
- veles-oceanus-artifacts.yaml
- veles-oceanus-policy.yaml

View File

@ -1,21 +0,0 @@
# infrastructure/modules/base/storageclass/veles-oceanus-artifacts.yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: veles-oceanus-artifacts
annotations:
veles.bstein.dev/allowed-namespace: veles
provisioner: driver.longhorn.io
parameters:
nodeSelector: veles-oceanus
diskSelector: veles-oceanus,veles-artifacts
fromBackup: ""
numberOfReplicas: "1"
staleReplicaTimeout: "30"
fsType: ext4
replicaAutoBalance: disabled
dataLocality: strict-local
recurringJobSelector: '[{"name":"veles-artifacts-backup","isGroup":false},{"name":"veles-artifacts-snapshot","isGroup":false}]'
reclaimPolicy: Retain
allowVolumeExpansion: true
volumeBindingMode: WaitForFirstConsumer

View File

@ -1,21 +0,0 @@
# infrastructure/modules/base/storageclass/veles-oceanus-db.yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: veles-oceanus-db
annotations:
veles.bstein.dev/allowed-namespace: veles
provisioner: driver.longhorn.io
parameters:
nodeSelector: veles-oceanus
diskSelector: veles-oceanus,veles-db
fromBackup: ""
numberOfReplicas: "1"
staleReplicaTimeout: "30"
fsType: ext4
replicaAutoBalance: disabled
dataLocality: strict-local
recurringJobSelector: '[{"name":"veles-postgres-backup","isGroup":false},{"name":"veles-postgres-snapshot","isGroup":false}]'
reclaimPolicy: Retain
allowVolumeExpansion: true
volumeBindingMode: WaitForFirstConsumer

View File

@ -1,25 +0,0 @@
# infrastructure/modules/base/storageclass/veles-oceanus-policy.yaml
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicy
metadata:
name: veles-oceanus-storage-namespace
spec:
failurePolicy: Fail
matchConstraints:
resourceRules:
- apiGroups: [""]
apiVersions: ["v1"]
operations: ["CREATE", "UPDATE"]
resources: ["persistentvolumeclaims"]
validations:
- expression: "!has(object.spec.storageClassName) || !(object.spec.storageClassName in ['veles-oceanus-db', 'veles-oceanus-artifacts']) || object.metadata.namespace == 'veles'"
message: "Veles Oceanus storage classes are reserved for namespace veles"
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicyBinding
metadata:
name: veles-oceanus-storage-namespace
spec:
policyName: veles-oceanus-storage-namespace
validationActions:
- Deny

View File

@ -1,4 +1,4 @@
# Harbor and Longhorn cold-start bootstrap images.
# Harbor cold-start bootstrap images.
registry.bstein.dev/infra/harbor-core:v2.14.1-arm64
registry.bstein.dev/infra/harbor-jobservice:v2.14.1-arm64
registry.bstein.dev/infra/harbor-portal:v2.14.1-arm64
@ -7,18 +7,3 @@ registry.bstein.dev/infra/harbor-registryctl:v2.14.1-arm64
registry.bstein.dev/infra/harbor-redis:v2.14.1-arm64
registry.bstein.dev/infra/harbor-nginx:v2.14.1-arm64
registry.bstein.dev/infra/harbor-prepare:v2.14.1-arm64
# Longhorn must be able to start before Harbor is fully healthy.
registry.bstein.dev/infra/longhorn-engine:v1.8.2
registry.bstein.dev/infra/longhorn-manager:v1.8.2
registry.bstein.dev/infra/longhorn-ui:v1.8.2
registry.bstein.dev/infra/longhorn-instance-manager:v1.8.2
registry.bstein.dev/infra/longhorn-share-manager:v1.8.2
registry.bstein.dev/infra/longhorn-backing-image-manager:v1.8.2
registry.bstein.dev/infra/longhorn-support-bundle-kit:v0.0.56
registry.bstein.dev/infra/longhorn-csi-attacher:v4.9.0
registry.bstein.dev/infra/longhorn-csi-provisioner:v5.3.0
registry.bstein.dev/infra/longhorn-csi-node-driver-registrar:v2.14.0
registry.bstein.dev/infra/longhorn-csi-resizer:v1.13.2
registry.bstein.dev/infra/longhorn-csi-snapshotter:v8.2.0
registry.bstein.dev/infra/longhorn-livenessprobe:v2.16.0

View File

@ -1,14 +0,0 @@
# Longhorn images needed when Harbor is unhealthy during storage recovery.
registry.bstein.dev/infra/longhorn-engine:v1.8.2
registry.bstein.dev/infra/longhorn-manager:v1.8.2
registry.bstein.dev/infra/longhorn-ui:v1.8.2
registry.bstein.dev/infra/longhorn-instance-manager:v1.8.2
registry.bstein.dev/infra/longhorn-share-manager:v1.8.2
registry.bstein.dev/infra/longhorn-backing-image-manager:v1.8.2
registry.bstein.dev/infra/longhorn-support-bundle-kit:v0.0.56
registry.bstein.dev/infra/longhorn-csi-attacher:v4.9.0
registry.bstein.dev/infra/longhorn-csi-provisioner:v5.3.0
registry.bstein.dev/infra/longhorn-csi-node-driver-registrar:v2.14.0
registry.bstein.dev/infra/longhorn-csi-resizer:v1.13.2
registry.bstein.dev/infra/longhorn-csi-snapshotter:v8.2.0
registry.bstein.dev/infra/longhorn-livenessprobe:v2.16.0

View File

@ -4,9 +4,7 @@ EXPECTED_FLUX_URL="ssh://git@scm.bstein.dev:2242/bstein/titan-iac.git"
SHUTDOWN_MODE="host-poweroff"
STATE_SUBDIR=".local/share/ananke"
HARBOR_BUNDLE_BASENAME="harbor-bootstrap-v2.14.1-arm64.tar.zst"
BOOTSTRAP_BUNDLE_ARCH="arm64"
RECOVERY_UNCORDON_DENYLIST="titan-18,titan-22,titan-24"
HARBOR_TARGET_NODE="titan-11"
HARBOR_TARGET_NODE=""
HARBOR_CANARY_NODE=""
HARBOR_HOST_LABEL_KEY="ananke.bstein.dev/harbor-bootstrap"
HARBOR_CANARY_IMAGE="registry.bstein.dev/bstein/kubectl:1.35.0"
@ -35,4 +33,4 @@ STARTUP_INCLUDE_INGRESS_CHECKS="1"
STARTUP_INGRESS_ALLOWED_STATUSES="200,301,302,307,308,401,403,404"
STARTUP_IGNORE_INGRESS_HOSTS_REGEX=""
STARTUP_INGRESS_CHECK_TIMEOUT_SECONDS="10"
STARTUP_SERVICE_CHECKLIST='gitea|https://scm.bstein.dev/api/healthz|200|"status":"pass"||;grafana|https://metrics.bstein.dev/api/health|200|"database":"ok"||;harbor|https://registry.bstein.dev/v2/|401|unauthorized|<html|'
STARTUP_SERVICE_CHECKLIST='gitea|https://scm.bstein.dev/api/healthz|200|"status":"pass"||;grafana|https://metrics.bstein.dev/api/health|200|"database":"ok"||;harbor|https://registry.bstein.dev/v2/|200,401|||'

View File

@ -5,7 +5,6 @@ IMAGES_FILE="scripts/bootstrap/harbor-bootstrap-images.txt"
BUNDLE_FILE="artifacts/harbor-bootstrap-v2.14.1-arm64.tar.zst"
DOCKER_CONFIG_PATH=""
PLATFORM="linux/arm64"
ZSTD_LEVEL="${ZSTD_LEVEL:-19}"
while [[ $# -gt 0 ]]; do
case "$1" in
@ -25,13 +24,9 @@ while [[ $# -gt 0 ]]; do
PLATFORM="${2:?missing platform}"
shift 2
;;
--zstd-level)
ZSTD_LEVEL="${2:?missing zstd compression level}"
shift 2
;;
-h|--help)
cat <<USAGE
Usage: scripts/build_harbor_bootstrap_bundle.sh [--images-file <path>] [--bundle-file <path>] [--docker-config <path>] [--platform <linux/arm64>] [--zstd-level <level>]
Usage: scripts/build_harbor_bootstrap_bundle.sh [--images-file <path>] [--bundle-file <path>] [--docker-config <path>] [--platform <linux/arm64>]
USAGE
exit 0
;;
@ -52,54 +47,12 @@ if [[ ${#IMAGES[@]} -eq 0 ]]; then
exit 1
fi
source_image_for_alias() {
local image="$1"
local tag="${image##*:}"
case "${image}" in
registry.bstein.dev/infra/longhorn-engine:*) echo "docker.io/longhornio/longhorn-engine:${tag}" ;;
registry.bstein.dev/infra/longhorn-manager:*) echo "docker.io/longhornio/longhorn-manager:${tag}" ;;
registry.bstein.dev/infra/longhorn-ui:*) echo "docker.io/longhornio/longhorn-ui:${tag}" ;;
registry.bstein.dev/infra/longhorn-instance-manager:*) echo "docker.io/longhornio/longhorn-instance-manager:${tag}" ;;
registry.bstein.dev/infra/longhorn-share-manager:*) echo "docker.io/longhornio/longhorn-share-manager:${tag}" ;;
registry.bstein.dev/infra/longhorn-backing-image-manager:*) echo "docker.io/longhornio/backing-image-manager:${tag}" ;;
registry.bstein.dev/infra/longhorn-support-bundle-kit:*) echo "docker.io/longhornio/support-bundle-kit:${tag}" ;;
registry.bstein.dev/infra/longhorn-csi-attacher:*) echo "registry.k8s.io/sig-storage/csi-attacher:${tag}" ;;
registry.bstein.dev/infra/longhorn-csi-provisioner:*) echo "registry.k8s.io/sig-storage/csi-provisioner:${tag}" ;;
registry.bstein.dev/infra/longhorn-csi-node-driver-registrar:*) echo "registry.k8s.io/sig-storage/csi-node-driver-registrar:${tag}" ;;
registry.bstein.dev/infra/longhorn-csi-resizer:*) echo "registry.k8s.io/sig-storage/csi-resizer:${tag}" ;;
registry.bstein.dev/infra/longhorn-csi-snapshotter:*) echo "registry.k8s.io/sig-storage/csi-snapshotter:${tag}" ;;
registry.bstein.dev/infra/longhorn-livenessprobe:*) echo "registry.k8s.io/sig-storage/livenessprobe:${tag}" ;;
*) echo "${image}" ;;
esac
}
pull_or_tag_image() {
local image="$1"
local source_image
if docker image inspect "${image}" >/dev/null 2>&1; then
echo "Using cached ${image}" >&2
return 0
fi
echo "Pulling ${image}" >&2
if docker pull --platform "${PLATFORM}" "${image}" >/dev/null; then
return 0
fi
source_image="$(source_image_for_alias "${image}")"
if [[ "${source_image}" == "${image}" ]]; then
return 1
fi
echo "Pulling ${source_image} for ${image}" >&2
docker pull --platform "${PLATFORM}" "${source_image}" >/dev/null
docker tag "${source_image}" "${image}"
}
mkdir -p "$(dirname "${BUNDLE_FILE}")"
for image in "${IMAGES[@]}"; do
pull_or_tag_image "${image}"
echo "Pulling ${image}" >&2
docker pull --platform "${PLATFORM}" "${image}" >/dev/null
done
tmp_bundle="${BUNDLE_FILE}.tmp"
rm -f "${tmp_bundle}"
docker save "${IMAGES[@]}" | zstd -T0 -"${ZSTD_LEVEL}" -o "${tmp_bundle}"
mv "${tmp_bundle}" "${BUNDLE_FILE}"
docker save "${IMAGES[@]}" | zstd -T0 -19 -o "${BUNDLE_FILE}"
echo "Wrote ${BUNDLE_FILE}" >&2

File diff suppressed because it is too large Load Diff

View File

@ -607,7 +607,6 @@ PLATFORM_TEST_SUITE_NAMES = [
"titan_iac",
"bstein_home",
"data_prepper",
"lesavka",
]
PLATFORM_TEST_SUCCESS_STATUS = "ok|passed|success"
PLATFORM_TEST_NON_FAILURE_STATUS = f"{PLATFORM_TEST_SUCCESS_STATUS}|not_applicable|skipped|na|n/a"
@ -638,7 +637,6 @@ PLATFORM_TEST_SUITE_VALUE_BY_NAME = {
"titan_iac": "titan_iac|titan-iac",
"bstein_home": "bstein_home|bstein-home",
"data_prepper": "data_prepper|data-prepper",
"lesavka": "lesavka",
}
PLATFORM_TEST_JENKINS_JOB_BY_SUITE = {
"ariadne": "ariadne",
@ -650,7 +648,6 @@ PLATFORM_TEST_JENKINS_JOB_BY_SUITE = {
"titan_iac": "titan-iac",
"bstein_home": "bstein-dev-home",
"data_prepper": "data-prepper",
"lesavka": "lesavka",
}
JENKINS_UI_BASE_DEFAULT = "https://ci.bstein.dev"
PLATFORM_TEST_SUITE_MATCHER = "|".join(
@ -676,23 +673,17 @@ PLATFORM_TEST_SOURCE_LINES_OVER_500_ROLLUP = (
)
PLATFORM_TEST_SONAR_HEALTH_ROLLUP = "platform_quality:sonar_gate_health_percent:latest_1h"
PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP = "platform_quality:test_category_health_rate:percent_1h"
PLATFORM_TEST_HISTORY_WINDOW = "7d"
PLATFORM_TEST_HISTORY_STEP = "1h"
PLATFORM_TEST_BRANCH_EVIDENCE_WINDOW = "7d"
PLATFORM_TEST_BRANCH_EVIDENCE_STEP = "1h"
PLATFORM_TEST_CASE_DISCOVERY_WINDOW = "24h"
PLATFORM_TEST_CASE_PANEL_WINDOW = "24h"
PLATFORM_TEST_SUCCESS_EVENTS_30D = (
f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",status=~\"{PLATFORM_TEST_SUCCESS_STATUS}\",{PLATFORM_TEST_EXPORT_FILTER}", PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)}) or on() vector(0))'
f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",status=~\"{PLATFORM_TEST_SUCCESS_STATUS}\",{PLATFORM_TEST_EXPORT_FILTER}", "30d", "15m")}) or on() vector(0))'
)
PLATFORM_TEST_TOTAL_EVENTS_30D = (
f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)}) or on() vector(0))'
f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", "30d", "15m")}) or on() vector(0))'
)
PLATFORM_TEST_SUCCESS_EVENTS_7D = (
f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",status=~\"{PLATFORM_TEST_SUCCESS_STATUS}\",{PLATFORM_TEST_EXPORT_FILTER}", PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)}) or on() vector(0))'
f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",status=~\"{PLATFORM_TEST_SUCCESS_STATUS}\",{PLATFORM_TEST_EXPORT_FILTER}", "7d")}) or on() vector(0))'
)
PLATFORM_TEST_TOTAL_EVENTS_7D = (
f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)}) or on() vector(0))'
f'(sum({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", "7d")}) or on() vector(0))'
)
PLATFORM_TEST_SUCCESS_EVENTS_24H = (
f'(sum({PLATFORM_TEST_RUNS_24H_ROLLUP}{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}"}}) or on() vector(0))'
@ -716,7 +707,7 @@ PLATFORM_TEST_FAILURES_24H_BY_SUITE = (
f'sort_desc(sum by (suite) ({PLATFORM_TEST_RUNS_24H_ROLLUP}{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status!~"{PLATFORM_TEST_SUCCESS_STATUS}"}}))'
)
PLATFORM_TEST_ACTIVITY_30D = (
f'sum by (suite, status) ({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)})'
f'sum by (suite, status) ({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", "30d", "15m")})'
)
PLATFORM_TEST_RUNS_24H_TOTAL = PLATFORM_TEST_TOTAL_EVENTS_24H
PLATFORM_TEST_ACTIVE_SUITES_24H = (
@ -725,7 +716,7 @@ PLATFORM_TEST_ACTIVE_SUITES_24H = (
)
PLATFORM_TEST_POINT_WINDOW = "1h"
PLATFORM_TEST_FRESH_WINDOW = "30h"
PLATFORM_TEST_LATEST_WINDOW = "7d"
PLATFORM_TEST_LATEST_WINDOW = "30d"
def platform_check_status_expr(
@ -838,7 +829,7 @@ PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
f'/ clamp_min((sum by (suite) ({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", "24h")})), 1))'
)
QUALITY_GATE_SUITE_INDEX_30D = (
f'sum by (suite) ({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)})'
f'sum by (suite) ({platform_runs_increase(f"suite=~\"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}\",{PLATFORM_TEST_EXPORT_FILTER}", "30d", "15m")})'
)
QUALITY_GATE_COVERAGE_BY_SUITE = (
f'max by (suite) ({PLATFORM_TEST_COVERAGE_ROLLUP}{{suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}"}})'
@ -1575,11 +1566,11 @@ def testing_case_variable():
"label": "Test Case",
"type": "query",
"query": (
"query_result(topk(75, count by (test) (max_over_time("
"query_result(topk(250, count by (test) (max_over_time("
f'platform_quality:test_case_health_rate:percent_1h{{suite=~"${{suite:regex}}",branch!="",'
f'branch=~"${{branch:regex}}",test!="",test!="__no_test_cases__",'
f'category!~"{PLATFORM_TEST_SUPPORT_CATEGORY_REGEX}"}}'
f"[{PLATFORM_TEST_CASE_DISCOVERY_WINDOW}:{PLATFORM_TEST_HISTORY_STEP}]))))"
"[$__range]))))"
),
"regex": '/test="([^"]+)"/',
"current": {"text": "All", "value": "$__all", "selected": True},
@ -1920,7 +1911,7 @@ OVERVIEW_PANEL_DESCRIPTIONS = {
TESTING_PANEL_DESCRIPTIONS = {
"Current Gate Health (%)": "Average latest required gate checks passing across selected suites; this is the current quality state.",
"CI Run Success Rate (24h)": "Percent of selected quality-gate CI runs that completed successfully in 24h; this is run health, not individual test pass rate.",
"CI Run Success Rate (7d)": "Percent of selected quality-gate CI runs that completed successfully in 7d; higher means more stable automation.",
"CI Run Success Rate (30d)": "Percent of selected quality-gate CI runs that completed successfully in 30d; higher means more stable automation.",
"Failed Runs (24h)": "Selected quality-gate runs that failed in 24h; zero is good and anything else needs a look.",
"CI Runs (24h)": "Selected quality-gate CI run count in 24h; zero means the dashboard may be stale.",
"Suite Freshness (24h)": "Percent of selected suites with at least one quality-gate CI run in 24h; 100% means inputs are fresh.",
@ -1954,7 +1945,7 @@ TESTING_PANEL_DESCRIPTIONS = {
"Supply Chain Healthy Rate": "Percent of supply-chain checks passing or not applicable; higher is better.",
"Test Drilldowns And Problem Tests": "Test-case detail for finding which tests are hurting reliability.",
"Problematic Tests Over Time (Top failures)": "Current outlier tests by rolling 24h failures; tests need repeat failures to stay visible.",
"Most Problematic Test by Suite (7d)": "Worst test per suite summed over 7d; high counts can be historical debt.",
"Most Problematic Test by Suite (30d)": "Worst test per suite summed over 30d; high counts can be historical debt.",
"Selected Test Pass/Fail History": "Hourly pass/fail/skipped volume for the selected test filter.",
"Selected Test Pass Rate History": "Pass rate history for the selected test filter; higher means the test is stable.",
"Telemetry Completeness And Branches": "Checks that each suite publishes the data this dashboard needs.",
@ -1964,8 +1955,8 @@ TESTING_PANEL_DESCRIPTIONS = {
"LOC Compliance Metrics Present by Suite": "Whether LOC metrics are present; 100% means size panels are reliable.",
"Test-Case Metrics Present by Suite": "Whether per-test metrics are present; 100% enables drilldowns.",
"Real Test Cases Present by Suite": "Whether real test names are present; 100% means not just placeholder telemetry.",
"Recent Branch Evidence by Suite (7d)": "Branches with recent CI evidence; unexpected branches can mean drift or stale work.",
"Primary Branch Clean by Suite (7d)": "Percent clean of non-primary branch evidence; 100% means only main/master is reporting.",
"Recent Branch Evidence by Suite (30d)": "Branches with recent CI evidence; unexpected branches can mean drift or stale work.",
"Primary Branch Clean by Suite (30d)": "Percent clean of non-primary branch evidence; 100% means only main/master is reporting.",
"SonarQube Project Health": "SonarQube availability, projects, fetch errors, and gate status.",
"SonarQube API Up": "Whether the SonarQube exporter can reach SonarQube; 1 is good.",
"Sonar Projects (Selected)": "Selected SonarQube project count; zero means Sonar is not tracking that suite.",
@ -4050,18 +4041,14 @@ def build_jobs_dashboard():
f'branch=~"{branch_var}",status!~"{success}"}}'
)
runs_24h = f'(sum({runs_24h_rollup_selector}) or on() vector(0))'
runs_history = (
f'(sum({platform_runs_increase(runs_selector, PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)}) '
"or on() vector(0))"
)
runs_30d = f'(sum({platform_runs_increase(runs_selector, "30d", "15m")}) or on() vector(0))'
success_24h = f'(sum({runs_24h_success_rollup_selector}) or on() vector(0))'
success_history_total = (
f'(sum({platform_runs_increase(runs_success_selector, PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)}) '
"or on() vector(0))"
success_30d = (
f'(sum({platform_runs_increase(runs_success_selector, "30d", "15m")}) or on() vector(0))'
)
failures_24h = f'(sum({runs_24h_failure_rollup_selector}) or on() vector(0))'
success_rate_24h = f"100 * ({success_24h}) / clamp_min(({runs_24h}), 1)"
success_rate_history = f"100 * ({success_history_total}) / clamp_min(({runs_history}), 1)"
success_rate_30d = f"100 * ({success_30d}) / clamp_min(({runs_30d}), 1)"
runs_by_suite_24h = f"sum by (suite) ({runs_24h_rollup_selector})"
success_by_suite_24h = f"sum by (suite) ({runs_24h_success_rollup_selector})"
success_rate_by_suite_24h = (
@ -4099,11 +4086,9 @@ def build_jobs_dashboard():
f"100 * (sum(({runs_by_suite_24h}) > bool 0) or on() vector(0)) "
f"/ clamp_min(count(({selected_suite_universe})), 1)"
)
success_history_runs = (
f"sum by (suite) ({platform_runs_increase(runs_selector, PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)})"
)
success_history_runs = f'sum by (suite) ({platform_runs_increase(runs_selector, "7d")})'
success_history_by_suite = (
f"(100 * sum by (suite) ({platform_runs_increase(runs_success_selector, PLATFORM_TEST_HISTORY_WINDOW, PLATFORM_TEST_HISTORY_STEP)}) "
f'(100 * sum by (suite) ({platform_runs_increase(runs_success_selector, "7d")}) '
f'/ ({success_history_runs})) and on(suite) (({success_history_runs}) > 0)'
)
daily_success_volume = (
@ -4190,11 +4175,11 @@ def build_jobs_dashboard():
f"and on (suite, test) topk(12, ({current_problem_test_candidates}) >= 2)"
)
problematic_tests_history = problematic_tests_history_core
rollup_failed_tests_history = (
f'sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}}[{PLATFORM_TEST_HISTORY_WINDOW}:{PLATFORM_TEST_HISTORY_STEP}]))'
rollup_failed_tests_30d = (
f'sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}}[30d:1h]))'
)
worst_test_per_suite_core = (
f"topk by (suite) (1, ({rollup_failed_tests_history}))"
f"topk by (suite) (1, ({rollup_failed_tests_30d}))"
)
worst_test_per_suite = worst_test_per_suite_core
@ -4231,13 +4216,13 @@ def build_jobs_dashboard():
f'branch!="",branch=~"{branch_var}",category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})'
)
recent_branch_evidence = (
f'sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[{PLATFORM_TEST_BRANCH_EVIDENCE_WINDOW}:{PLATFORM_TEST_BRANCH_EVIDENCE_STEP}])))'
f'sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[30d:15m])))'
)
non_primary_branch_evidence = (
f'count by (suite) (max_over_time(platform_quality_gate_build_info{{{build_info_selector},branch!~"main|master|origin/main|origin/master|unknown"}}[{PLATFORM_TEST_BRANCH_EVIDENCE_WINDOW}:{PLATFORM_TEST_BRANCH_EVIDENCE_STEP}]))'
f'count by (suite) (max_over_time(platform_quality_gate_build_info{{{build_info_selector},branch!~"main|master|origin/main|origin/master|unknown"}}[30d:15m]))'
)
branch_evidence_by_suite = (
f'count by (suite) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[{PLATFORM_TEST_BRANCH_EVIDENCE_WINDOW}:{PLATFORM_TEST_BRANCH_EVIDENCE_STEP}]))'
f'count by (suite) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[30d:15m]))'
)
primary_branch_clean_by_suite = (
f'(100 * ((({branch_evidence_by_suite}) > bool 0) '
@ -4354,8 +4339,8 @@ def build_jobs_dashboard():
panels.append(
stat_panel(
3,
"CI Run Success Rate (7d)",
success_rate_history,
"CI Run Success Rate (30d)",
success_rate_30d,
{"h": 5, "w": 4, "x": 4, "y": 0},
unit="percent",
decimals=2,
@ -4476,7 +4461,6 @@ def build_jobs_dashboard():
"so failed or aborted runs lower the lane color without implying raw test failures."
),
)
history_panel["timeFrom"] = PLATFORM_TEST_HISTORY_WINDOW
panels.append(history_panel)
run_volume_panel = timeseries_panel(
@ -4492,7 +4476,6 @@ def build_jobs_dashboard():
legend_display="list",
legend_placement="bottom",
legend_calcs=[],
time_from=PLATFORM_TEST_HISTORY_WINDOW,
)
run_volume_panel["description"] = (
"Twenty-four-hour rolling quality-gate run counts for the selected suite/branch scope. "
@ -4508,32 +4491,32 @@ def build_jobs_dashboard():
}
panels.append(run_volume_panel)
coverage_history_panel = state_timeline_panel(
13,
"Coverage History by Suite",
coverage_history_by_suite,
{"h": 8, "w": 8, "x": 8, "y": 21},
thresholds=coverage_thresholds,
description=(
"Latest reported line coverage per suite over time. Coverage is separate "
"from LOC compliance so one signal cannot hide the other."
),
panels.append(
state_timeline_panel(
13,
"Coverage History by Suite",
coverage_history_by_suite,
{"h": 8, "w": 8, "x": 8, "y": 21},
thresholds=coverage_thresholds,
description=(
"Latest reported line coverage per suite over time. Coverage is separate "
"from LOC compliance so one signal cannot hide the other."
),
)
)
coverage_history_panel["timeFrom"] = PLATFORM_TEST_HISTORY_WINDOW
panels.append(coverage_history_panel)
loc_history_panel = state_timeline_panel(
14,
"Files <=500 LOC History by Suite",
loc_limit_compliance_history,
{"h": 8, "w": 8, "x": 16, "y": 21},
thresholds=success_thresholds,
description=(
"Percent of LOC-gated source files at or under the 500-line limit. "
"This uses the existing file-count telemetry; longest-file history needs a new publisher metric."
),
panels.append(
state_timeline_panel(
14,
"Files <=500 LOC History by Suite",
loc_limit_compliance_history,
{"h": 8, "w": 8, "x": 16, "y": 21},
thresholds=success_thresholds,
description=(
"Percent of LOC-gated source files at or under the 500-line limit. "
"This uses the existing file-count telemetry; longest-file history needs a new publisher metric."
),
)
)
loc_history_panel["timeFrom"] = PLATFORM_TEST_HISTORY_WINDOW
panels.append(loc_history_panel)
check_dimensions = [
("Tests", check_regex_tests),
@ -4563,7 +4546,6 @@ def build_jobs_dashboard():
thresholds=trend_thresholds,
description=trend_description,
)
panel["timeFrom"] = PLATFORM_TEST_HISTORY_WINDOW
panels.append(panel)
for index, (label, regex) in enumerate(check_dimensions[4:]):
panel = state_timeline_panel(
@ -4574,7 +4556,6 @@ def build_jobs_dashboard():
thresholds=trend_thresholds,
description=trend_description,
)
panel["timeFrom"] = PLATFORM_TEST_HISTORY_WINDOW
panels.append(panel)
_append_check_trends(130, "Failure Rate", True, 29)
@ -4596,13 +4577,12 @@ def build_jobs_dashboard():
),
)
)
panels[-1]["timeFrom"] = PLATFORM_TEST_CASE_PANEL_WINDOW
panels[-1]["links"] = jenkins_suite_links()
panels[-1]["fieldConfig"]["defaults"]["links"] = jenkins_latest_artifact_data_links()
panels.append(
bargauge_panel(
147,
"Most Problematic Test by Suite (7d)",
"Most Problematic Test by Suite (30d)",
worst_test_per_suite,
{"h": 8, "w": 12, "x": 12, "y": 57},
unit="none",
@ -4616,8 +4596,8 @@ def build_jobs_dashboard():
)
)
panels[-1]["description"] = (
"Worst test per suite summed across 7d. This catches repeat offenders while keeping dashboard "
"loads bounded; current hourly top list is quiet."
"Worst test per suite summed across 30d. This catches historical repeat offenders even when the "
"current hourly top list is quiet."
)
panels.append(
timeseries_panel(
@ -4636,9 +4616,8 @@ def build_jobs_dashboard():
)
panels[-1]["description"] = (
"Stacked hourly outcome volume for the selected suite/branch/test scope. "
"This uses vmalert rollups only, avoiding expensive raw long-range per-test scans."
"This uses vmalert rollups only, avoiding expensive raw 30-day per-test scans."
)
panels[-1]["timeFrom"] = PLATFORM_TEST_CASE_PANEL_WINDOW
panels[-1]["fieldConfig"]["defaults"]["min"] = 0
panels[-1]["fieldConfig"]["defaults"]["custom"] = {
"drawStyle": "bars",
@ -4659,7 +4638,6 @@ def build_jobs_dashboard():
"test-case pass-rate rollups instead of raw historical scans."
),
)
selected_pass_rate_panel["timeFrom"] = PLATFORM_TEST_CASE_PANEL_WINDOW
selected_pass_rate_panel["links"] = jenkins_suite_links()
selected_pass_rate_panel["fieldConfig"]["defaults"]["links"] = jenkins_artifact_data_links()
panels.append(selected_pass_rate_panel)
@ -4675,7 +4653,6 @@ def build_jobs_dashboard():
"project; skipped tests are healthy, while failures and errors lower the lane."
),
)
category_pass_rate_panel["timeFrom"] = PLATFORM_TEST_HISTORY_WINDOW
category_pass_rate_panel["links"] = jenkins_suite_links()
panels.append(category_pass_rate_panel)
@ -4817,23 +4794,23 @@ def build_jobs_dashboard():
)
sonar_status_mix_panel["targets"][0]["legendFormat"] = "{{status}}"
panels.append(sonar_status_mix_panel)
sonar_gate_project_panel = state_timeline_panel(
35,
"Sonar Gate Health by Project",
f'{PLATFORM_TEST_SONAR_HEALTH_ROLLUP}{{project_key=~"{suite_var}"}}',
{"h": 6, "w": 8, "x": 16, "y": 88},
thresholds=success_thresholds,
unit="percent",
min_value=0,
max_value=100,
legend="{{project_key}}",
description=(
"SonarQube gate status over time by project. OK projects render as full healthy lanes; "
"non-OK projects drop to red without disappearing."
),
panels.append(
state_timeline_panel(
35,
"Sonar Gate Health by Project",
f'{PLATFORM_TEST_SONAR_HEALTH_ROLLUP}{{project_key=~"{suite_var}"}}',
{"h": 6, "w": 8, "x": 16, "y": 88},
thresholds=success_thresholds,
unit="percent",
min_value=0,
max_value=100,
legend="{{project_key}}",
description=(
"SonarQube gate status over time by project. OK projects render as full healthy lanes; "
"non-OK projects drop to red without disappearing."
),
)
)
sonar_gate_project_panel["timeFrom"] = PLATFORM_TEST_HISTORY_WINDOW
panels.append(sonar_gate_project_panel)
panels.append(
bargauge_panel(
148,
@ -4865,7 +4842,7 @@ def build_jobs_dashboard():
panels.append(
bargauge_panel(
149,
"Recent Branch Evidence by Suite (7d)",
"Recent Branch Evidence by Suite (30d)",
recent_branch_evidence,
{"h": 7, "w": 12, "x": 0, "y": 100},
unit="none",
@ -4880,7 +4857,7 @@ def build_jobs_dashboard():
panels.append(
bargauge_panel(
150,
"Primary Branch Clean by Suite (7d)",
"Primary Branch Clean by Suite (30d)",
primary_branch_clean_by_suite,
{"h": 7, "w": 12, "x": 12, "y": 100},
unit="percent",
@ -4992,7 +4969,7 @@ def build_jobs_dashboard():
"folderUid": PRIVATE_FOLDER,
"editable": True,
"panels": panels,
"time": {"from": "now-24h", "to": "now"},
"time": {"from": "now-30d", "to": "now"},
"annotations": {"list": []},
"schemaVersion": 39,
"style": "dark",

View File

@ -192,21 +192,19 @@ def test_render_configmap_writes(tmp_path):
assert f"{uid}.json" in content
def test_testing_suite_variable_uses_configured_suite_values_only():
def test_testing_suite_variable_uses_canonical_values_only():
mod = load_module()
variable = mod.testing_suite_variable()
configured_matcher = "|".join(mod.PLATFORM_TEST_SUITE_NAMES)
canonical_matcher = "|".join(mod.PLATFORM_TEST_SUITE_NAMES)
legacy_names = {"bstein-home", "data-prepper", "titan-iac", "pegasus-health"}
out_of_scope_names = {"arcanagon", "typhon"}
out_of_scope_names = {"arcanagon", "lesavka", "typhon"}
assert variable["allValue"] == configured_matcher
assert variable["allValue"] == canonical_matcher
assert not any(alias in variable["query"] for alias in legacy_names)
assert not any(alias in variable["allValue"] for alias in legacy_names)
assert not any(name in variable["query"] for name in out_of_scope_names)
assert not any(name in variable["allValue"] for name in out_of_scope_names)
assert [option["value"] for option in variable["options"]] == mod.PLATFORM_TEST_SUITE_NAMES
assert "lesavka" in variable["allValue"]
assert any(option["value"] == "lesavka" for option in variable["options"])
assert not any(
option["value"] in out_of_scope_names for option in variable["options"]
)
@ -468,3 +466,41 @@ def test_jobs_dashboard_collapses_heavy_drilldowns_for_light_first_paint():
assert "unless on(suite)" in branch_panel["targets"][0]["expr"]
assert "> bool 0" in branch_panel["targets"][0]["expr"]
assert branch_panel["targets"][0]["expr"].startswith("sort(")
def test_in_scope_jenkins_jobs_have_twice_daily_refresh_trigger():
    """Check that every in-scope Jenkins job declares the twice-daily cron trigger.

    Reads ``services/jenkins/configmap-jcasc.yaml`` relative to the current
    working directory. For each job name, the text between its
    ``pipelineJob('<name>')`` marker and the next ``pipelineJob(`` marker (or
    end of file) must contain ``cron`` and the spec ``H H/12 * * *``.

    Raises:
        IndexError: if a job has no ``pipelineJob('<name>')`` marker in the file.
        AssertionError: if a job block lacks the cron trigger or the expected spec.
    """
    casc = pathlib.Path("services/jenkins/configmap-jcasc.yaml").read_text()
    in_scope_jobs = [
        "ananke",
        "ariadne",
        "atlasbot",
        "bstein-dev-home",
        "data-prepper",
        "metis",
        "pegasus",
        "soteria",
        "titan-iac",
    ]
    for job in in_scope_jobs:
        # Slice out this job's definition: everything after its marker, up to
        # the start of the next pipelineJob definition.
        block = casc.split(f"pipelineJob('{job}')", 1)[1].split("pipelineJob(", 1)[0]
        # Name the job in the failure message; a bare assert inside a loop
        # does not tell the reader which iteration failed.
        assert "cron" in block, f"{job}: no cron trigger in pipelineJob block"
        assert "spec('H H/12 * * *')" in block, f"{job}: cron spec is not 'H H/12 * * *'"
def test_lesavka_jenkins_job_has_daily_refresh_trigger():
    """Check that the ``lesavka`` Jenkins job polls SCM and has a daily cron trigger.

    Reads ``services/jenkins/configmap-jcasc.yaml`` relative to the current
    working directory and inspects the text between ``pipelineJob('lesavka')``
    and the next ``pipelineJob(`` marker (or end of file).

    Raises:
        IndexError: if the file has no ``pipelineJob('lesavka')`` marker.
        AssertionError: if the SCM poll spec, cron trigger or cron spec is missing.
    """
    casc = pathlib.Path("services/jenkins/configmap-jcasc.yaml").read_text()
    lesavka_block = casc.split("pipelineJob('lesavka')", 1)[1].split("pipelineJob(", 1)[0]
    assert "scmpoll_spec('H/5 * * * *')" in lesavka_block, "lesavka: SCM poll spec is not 'H/5 * * * *'"
    assert "cron" in lesavka_block, "lesavka: no cron trigger in pipelineJob block"
    assert "spec('H H * * *')" in lesavka_block, "lesavka: cron spec is not 'H H * * *'"
def test_typhon_jenkins_job_has_daily_refresh_trigger():
    """Check that the ``typhon`` Jenkins job polls SCM and has a daily cron trigger.

    Reads ``services/jenkins/configmap-jcasc.yaml`` relative to the current
    working directory and inspects the text between ``pipelineJob('typhon')``
    and the next ``pipelineJob(`` marker (or end of file).

    Raises:
        IndexError: if the file has no ``pipelineJob('typhon')`` marker.
        AssertionError: if the SCM poll spec, cron trigger or cron spec is missing.
    """
    casc = pathlib.Path("services/jenkins/configmap-jcasc.yaml").read_text()
    typhon_block = casc.split("pipelineJob('typhon')", 1)[1].split("pipelineJob(", 1)[0]
    assert "scmpoll_spec('H/5 * * * *')" in typhon_block, "typhon: SCM poll spec is not 'H/5 * * * *'"
    assert "cron" in typhon_block, "typhon: no cron trigger in pipelineJob block"
    assert "spec('H H * * *')" in typhon_block, "typhon: cron spec is not 'H H * * *'"

View File

@ -1,41 +0,0 @@
import pathlib
def jcasc_job_block(job: str) -> str:
    """Return the JCasC job-DSL text that belongs to one ``pipelineJob``.

    Reads ``services/jenkins/configmap-jcasc.yaml`` relative to the current
    working directory on every call.

    Args:
        job: Job name exactly as written inside ``pipelineJob('<name>')``.

    Returns:
        The text after the job's ``pipelineJob('<name>')`` marker, up to (not
        including) the next ``pipelineJob(`` marker, or to end of file when the
        job is the last one.

    Raises:
        IndexError: if the file contains no ``pipelineJob('<job>')`` marker.
    """
    casc_path = pathlib.Path("services/jenkins/configmap-jcasc.yaml")
    casc = casc_path.read_text()
    marker = f"pipelineJob('{job}')"
    pieces = casc.split(marker, 1)
    if len(pieces) < 2:
        # Same exception type the bare [1] index would raise, but with a
        # message that names the missing job instead of "list index out of range".
        raise IndexError(f"{marker} not found in {casc_path}")
    # Cut at the next job definition so neighbouring jobs cannot satisfy a check.
    return pieces[1].split("pipelineJob(", 1)[0]
def test_in_scope_jenkins_jobs_have_twice_daily_refresh_trigger():
    """Check that every in-scope Jenkins job declares the twice-daily cron trigger.

    For each job name, the block returned by ``jcasc_job_block`` must contain
    ``cron`` and the spec ``H H/12 * * *``.

    Raises:
        AssertionError: if a job block lacks the cron trigger or the expected spec.
    """
    in_scope_jobs = [
        "ananke",
        "ariadne",
        "atlasbot",
        "bstein-dev-home",
        "data-prepper",
        "metis",
        "pegasus",
        "soteria",
        "titan-iac",
    ]
    for job in in_scope_jobs:
        block = jcasc_job_block(job)
        # Name the job in the failure message; a bare assert inside a loop
        # does not tell the reader which iteration failed.
        assert "cron" in block, f"{job}: no cron trigger in pipelineJob block"
        assert "spec('H H/12 * * *')" in block, f"{job}: cron spec is not 'H H/12 * * *'"
def test_lesavka_jenkins_job_has_daily_refresh_trigger():
    """Check that the ``lesavka`` Jenkins job polls SCM and has a daily cron trigger.

    Raises:
        AssertionError: if the SCM poll spec, cron trigger or cron spec is
            missing from the block returned by ``jcasc_job_block("lesavka")``.
    """
    lesavka_block = jcasc_job_block("lesavka")
    assert "scmpoll_spec('H/5 * * * *')" in lesavka_block, "lesavka: SCM poll spec is not 'H/5 * * * *'"
    assert "cron" in lesavka_block, "lesavka: no cron trigger in pipelineJob block"
    assert "spec('H H * * *')" in lesavka_block, "lesavka: cron spec is not 'H H * * *'"
def test_typhon_jenkins_job_has_daily_refresh_trigger():
    """Check that the ``typhon`` Jenkins job polls SCM and has a daily cron trigger.

    Raises:
        AssertionError: if the SCM poll spec, cron trigger or cron spec is
            missing from the block returned by ``jcasc_job_block("typhon")``.
    """
    typhon_block = jcasc_job_block("typhon")
    assert "scmpoll_spec('H/5 * * * *')" in typhon_block, "typhon: SCM poll spec is not 'H/5 * * * *'"
    assert "cron" in typhon_block, "typhon: no cron trigger in pipelineJob block"
    assert "spec('H H * * *')" in typhon_block, "typhon: cron spec is not 'H H * * *'"

View File

@ -20,9 +20,9 @@ resources:
- ingress.yaml
images:
- name: registry.bstein.dev/bstein/bstein-dev-home-frontend
newTag: 0.1.1-345 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
newTag: 0.1.1-314 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-frontend:tag"}
- name: registry.bstein.dev/bstein/bstein-dev-home-backend
newTag: 0.1.1-345 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
newTag: 0.1.1-314 # {"$imagepolicy": "bstein-dev-home:bstein-dev-home-backend:tag"}
configMapGenerator:
- name: chat-ai-gateway
namespace: bstein-dev-home

View File

@ -56,36 +56,6 @@ spec:
{{ with secret "kv/data/atlas/gitea/gitea-oidc" }}
{{ .Data.data.openid_auto_discovery_url }}
{{ end }}
vault.hashicorp.com/agent-inject-secret-gitea-veles__client_id: "kv/data/atlas/gitea/gitea-veles-oidc"
vault.hashicorp.com/agent-inject-template-gitea-veles__client_id: |
{{ with secret "kv/data/atlas/gitea/gitea-veles-oidc" }}
{{ .Data.data.client_id }}
{{ end }}
vault.hashicorp.com/agent-inject-secret-gitea-veles__client_secret: "kv/data/atlas/gitea/gitea-veles-oidc"
vault.hashicorp.com/agent-inject-template-gitea-veles__client_secret: |
{{ with secret "kv/data/atlas/gitea/gitea-veles-oidc" }}
{{ .Data.data.client_secret }}
{{ end }}
vault.hashicorp.com/agent-inject-secret-gitea-veles__discovery_url: "kv/data/atlas/gitea/gitea-veles-oidc"
vault.hashicorp.com/agent-inject-template-gitea-veles__discovery_url: |
{{ with secret "kv/data/atlas/gitea/gitea-veles-oidc" }}
{{ .Data.data.openid_auto_discovery_url }}
{{ end }}
vault.hashicorp.com/agent-inject-secret-gitea-veles__claim_name: "kv/data/atlas/gitea/gitea-veles-oidc"
vault.hashicorp.com/agent-inject-template-gitea-veles__claim_name: |
{{ with secret "kv/data/atlas/gitea/gitea-veles-oidc" }}
{{ .Data.data.required_claim_name }}
{{ end }}
vault.hashicorp.com/agent-inject-secret-gitea-veles__claim_value: "kv/data/atlas/gitea/gitea-veles-oidc"
vault.hashicorp.com/agent-inject-template-gitea-veles__claim_value: |
{{ with secret "kv/data/atlas/gitea/gitea-veles-oidc" }}
{{ .Data.data.required_claim_value }}
{{ end }}
vault.hashicorp.com/agent-inject-secret-gitea-veles__team_map: "kv/data/atlas/gitea/gitea-veles-oidc"
vault.hashicorp.com/agent-inject-template-gitea-veles__team_map: |
{{ with secret "kv/data/atlas/gitea/gitea-veles-oidc" }}
{{ .Data.data.group_team_map }}
{{ end }}
spec:
serviceAccountName: gitea-vault
initContainers:
@ -98,102 +68,49 @@ spec:
- /bin/sh
- -c
- |
set -eu
set -euo pipefail
CLIENT_ID="$(tr -d '\r\n' </vault/secrets/gitea-oidc__client_id)"
CLIENT_SECRET="$(tr -d '\r\n' </vault/secrets/gitea-oidc__client_secret)"
DISCOVERY_URL="$(tr -d '\r\n' </vault/secrets/gitea-oidc__openid_auto_discovery_url)"
APPINI=/data/gitea/conf/app.ini
BIN=/usr/local/bin/gitea
read_secret() {
path="$1"
if [ ! -r "$path" ]; then
echo "Missing readable Vault secret file: $path" >&2
return 1
fi
tr -d '\r\n' <"$path"
}
list="$($BIN -c "$APPINI" admin auth list)"
id=$(echo "$list" | awk '$2=="keycloak"{print $1}')
CLIENT_ID="$(read_secret /vault/secrets/gitea-oidc__client_id || true)"
CLIENT_SECRET="$(read_secret /vault/secrets/gitea-oidc__client_secret || true)"
DISCOVERY_URL="$(read_secret /vault/secrets/gitea-oidc__openid_auto_discovery_url || true)"
VELES_CLIENT_ID="$(read_secret /vault/secrets/gitea-veles__client_id || true)"
VELES_CLIENT_SECRET="$(read_secret /vault/secrets/gitea-veles__client_secret || true)"
VELES_DISCOVERY_URL="$(read_secret /vault/secrets/gitea-veles__discovery_url || true)"
VELES_REQUIRED_CLAIM_NAME="$(read_secret /vault/secrets/gitea-veles__claim_name || true)"
VELES_REQUIRED_CLAIM_VALUE="$(read_secret /vault/secrets/gitea-veles__claim_value || true)"
VELES_GROUP_TEAM_MAP="$(read_secret /vault/secrets/gitea-veles__team_map || true)"
if [ ! -r "$APPINI" ]; then
echo "Gitea app.ini is not readable yet; skipping OIDC source maintenance" >&2
exit 0
fi
if ! list="$($BIN -c "$APPINI" admin auth list)"; then
echo "Gitea auth source list failed; skipping OIDC source maintenance" >&2
exit 0
fi
ensure_oidc_source() {
source_name="$1"
source_scopes="$2"
source_client_id="$3"
source_client_secret="$4"
source_discovery_url="$5"
source_required_claim_name="$6"
source_required_claim_value="$7"
shift 7
id=$(echo "$list" | awk -v name="$source_name" '$2==name{print $1}')
if [ -n "$id" ]; then
echo "Updating auth source ${source_name} id=${id}"
if ! $BIN -c "$APPINI" admin auth update-oauth \
--id "$id" \
--name "$source_name" \
--provider openidConnect \
--key "$source_client_id" \
--secret "$source_client_secret" \
--auto-discover-url "$source_discovery_url" \
--scopes "$source_scopes" \
--required-claim-name "$source_required_claim_name" \
--required-claim-value "$source_required_claim_value" \
--group-claim-name groups \
--admin-group admin \
--skip-local-2fa "$@"; then
echo "OIDC update failed for ${source_name}; continuing without blocking startup" >&2
fi
else
echo "Creating auth source ${source_name}"
if ! $BIN -c "$APPINI" admin auth add-oauth \
--name "$source_name" \
--provider openidConnect \
--key "$source_client_id" \
--secret "$source_client_secret" \
--auto-discover-url "$source_discovery_url" \
--scopes "$source_scopes" \
--required-claim-name "$source_required_claim_name" \
--required-claim-value "$source_required_claim_value" \
--group-claim-name groups \
--admin-group admin \
--skip-local-2fa "$@"; then
echo "OIDC create failed for ${source_name}; continuing without blocking startup" >&2
fi
fi
}
if [ -n "$CLIENT_ID" ] && [ -n "$CLIENT_SECRET" ] && [ -n "$DISCOVERY_URL" ]; then
ensure_oidc_source keycloak "openid profile email groups" "$CLIENT_ID" "$CLIENT_SECRET" "$DISCOVERY_URL" "" ""
else
echo "Skipping keycloak auth source maintenance because atlas OIDC secret data is incomplete" >&2
fi
if [ -n "$VELES_CLIENT_ID" ] && [ -n "$VELES_CLIENT_SECRET" ] && [ -n "$VELES_DISCOVERY_URL" ] && [ -n "$VELES_REQUIRED_CLAIM_NAME" ] && [ -n "$VELES_REQUIRED_CLAIM_VALUE" ]; then
if [ -n "$VELES_GROUP_TEAM_MAP" ]; then
ensure_oidc_source veles "openid profile email groups" "$VELES_CLIENT_ID" "$VELES_CLIENT_SECRET" "$VELES_DISCOVERY_URL" "$VELES_REQUIRED_CLAIM_NAME" "$VELES_REQUIRED_CLAIM_VALUE" \
--restricted-group "$VELES_REQUIRED_CLAIM_VALUE" \
--group-team-map "$VELES_GROUP_TEAM_MAP"
else
ensure_oidc_source veles "openid profile email groups" "$VELES_CLIENT_ID" "$VELES_CLIENT_SECRET" "$VELES_DISCOVERY_URL" "$VELES_REQUIRED_CLAIM_NAME" "$VELES_REQUIRED_CLAIM_VALUE" \
--restricted-group "$VELES_REQUIRED_CLAIM_VALUE"
if [ -n "$id" ]; then
echo "Updating existing auth source id=$id"
if ! $BIN -c "$APPINI" admin auth update-oauth \
--id "$id" \
--name keycloak \
--provider openidConnect \
--key "$CLIENT_ID" \
--secret "$CLIENT_SECRET" \
--auto-discover-url "$DISCOVERY_URL" \
--scopes "openid profile email groups" \
--required-claim-name "" \
--required-claim-value "" \
--group-claim-name groups \
--admin-group admin \
--skip-local-2fa; then
echo "OIDC update failed; continuing without blocking startup" >&2
fi
else
echo "Skipping veles auth source maintenance because Veles OIDC secret data is incomplete" >&2
echo "Creating keycloak auth source"
if ! $BIN -c "$APPINI" admin auth add-oauth \
--name keycloak \
--provider openidConnect \
--key "$CLIENT_ID" \
--secret "$CLIENT_SECRET" \
--auto-discover-url "$DISCOVERY_URL" \
--scopes "openid profile email groups" \
--required-claim-name "" \
--required-claim-value "" \
--group-claim-name groups \
--admin-group admin \
--skip-local-2fa; then
echo "OIDC create failed; continuing without blocking startup" >&2
fi
fi
volumeMounts:
- name: gitea-data

View File

@ -5,14 +5,6 @@ resources:
- namespace.yaml
- serviceaccount.yaml
- pvc.yaml
- oneoffs/veles-feedback-acl-ensure-job.yaml
- deployment.yaml
- service.yaml
- ingress.yaml
configMapGenerator:
- name: veles-feedback-acl-ensure-script
namespace: gitea
files:
- scripts/veles_feedback_acl_ensure.sh
generatorOptions:
disableNameSuffixHash: true

View File

@ -1,51 +0,0 @@
# services/gitea/oneoffs/veles-feedback-acl-ensure-job.yaml
# One-off job for gitea/veles-feedback-acl-ensure-2.
# Purpose: keep Veles testers on the feedback repo without granting source access.
apiVersion: batch/v1
kind: Job
metadata:
  name: veles-feedback-acl-ensure-2
  namespace: gitea
spec:
  suspend: false
  # A failed run is not retried.
  backoffLimit: 0
  template:
    metadata:
      annotations:
        # Vault agent annotations: render the Gitea DB password to
        # /vault/secrets/gitea-db-secret__password, the path the script reads.
        vault.hashicorp.com/agent-inject: "true"
        vault.hashicorp.com/agent-pre-populate-only: "true"
        vault.hashicorp.com/role: "gitea"
        vault.hashicorp.com/agent-inject-secret-gitea-db-secret__password: "kv/data/atlas/gitea/gitea-db-secret"
        vault.hashicorp.com/agent-inject-template-gitea-db-secret__password: |
          {{ with secret "kv/data/atlas/gitea/gitea-db-secret" }}
          {{ .Data.data.password }}
          {{ end }}
    spec:
      serviceAccountName: gitea-vault
      restartPolicy: Never
      volumes:
        - name: veles-feedback-acl-ensure-script
          configMap:
            name: veles-feedback-acl-ensure-script
            # Integer field: leave unquoted. The leading zero makes this octal
            # (r-xr-xr-x) so the mounted script is executable.
            defaultMode: 0555
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/arch
                    operator: In
                    values: ["arm64"]
                  - key: hardware
                    operator: In
                    values: ["rpi5"]
                  - key: node-role.kubernetes.io/worker
                    operator: Exists
      containers:
        - name: apply
          image: postgres:15
          command: ["/scripts/veles_feedback_acl_ensure.sh"]
          volumeMounts:
            - name: veles-feedback-acl-ensure-script
              mountPath: /scripts
              readOnly: true

View File

@ -1,90 +0,0 @@
#!/usr/bin/env sh
# Ensure the Veles tester team's Gitea ACL rows by writing directly to the
# Gitea PostgreSQL database in one transaction.
#
# Environment overrides (defaults in parentheses):
#   GITEA_DB_HOST (postgres-service.postgres.svc.cluster.local)
#   GITEA_DB_PORT (5432), GITEA_DB_NAME (gitea), GITEA_DB_USER (gitea)
#   VELES_GITEA_ORG (veles-alpha), VELES_GITEA_FEEDBACK_REPO (feedback),
#   VELES_GITEA_TESTER_TEAM (testers)
# The database password is read from /vault/secrets/gitea-db-secret__password.
# Exits non-zero if that file is unreadable or any SQL statement fails.
set -eu
db_host="${GITEA_DB_HOST:-postgres-service.postgres.svc.cluster.local}"
db_port="${GITEA_DB_PORT:-5432}"
db_name="${GITEA_DB_NAME:-gitea}"
db_user="${GITEA_DB_USER:-gitea}"
org_name="${VELES_GITEA_ORG:-veles-alpha}"
repo_name="${VELES_GITEA_FEEDBACK_REPO:-feedback}"
team_name="${VELES_GITEA_TESTER_TEAM:-testers}"
# Fail early with a clear message when the password file is not readable.
if [ ! -r /vault/secrets/gitea-db-secret__password ]; then
echo "Missing readable Vault secret file: /vault/secrets/gitea-db-secret__password" >&2
exit 1
fi
# psql reads the password from PGPASSWORD; strip CR/LF left by the rendered template.
export PGPASSWORD
PGPASSWORD="$(tr -d '\r\n' </vault/secrets/gitea-db-secret__password)"
# ON_ERROR_STOP=1 makes psql stop at the first failing statement.
# The variable is expanded unquoted below on purpose so it splits into arguments.
psql_base="psql -h ${db_host} -p ${db_port} -U ${db_user} -d ${db_name} -v ON_ERROR_STOP=1 -P pager=off"
# The SQL below runs as one transaction:
#   1. Resolve the (org, repo, team) id triple into a temp table, matching the
#      names case-insensitively.
#   2. Abort unless exactly one triple matched.
#   3. Update the team row (authorize, includes_all_repositories,
#      can_create_org_repo).
#   4. Link the team to the repo in team_repo if the link is missing.
#   5. Delete the team's team_unit rows for types 1,2,3,4,5,8,9,10 and re-insert
#      them with access_mode 0, except type 2 which gets access_mode 2.
# NOTE(review): the meaning of the numeric unit types and access modes (type 2
# presumably Issues, access_mode 2 presumably write, org.type = 1 presumably an
# organization) comes from Gitea's schema, not from this script -- confirm
# against the deployed Gitea version.
# The heredoc delimiter is quoted, so the shell expands nothing inside it; the
# names are passed in as psql variables and interpolated with :'name'.
${psql_base} \
-v org_name="${org_name}" \
-v repo_name="${repo_name}" \
-v team_name="${team_name}" <<'SQL'
begin;
create temporary table veles_acl_ids on commit drop as
select
org.id as org_id,
repo.id as repo_id,
team.id as team_id
from gitea."user" org
join gitea.repository repo
on repo.owner_id = org.id
join gitea.team team
on team.org_id = org.id
where org.lower_name = lower(:'org_name')
and org.type = 1
and repo.lower_name = lower(:'repo_name')
and team.lower_name = lower(:'team_name');
do $$
begin
if (select count(*) from veles_acl_ids) != 1 then
raise exception 'Expected one veles feedback ACL target, found %', (select count(*) from veles_acl_ids);
end if;
end $$;
update gitea.team team
set authorize = 1,
includes_all_repositories = true,
can_create_org_repo = false
from veles_acl_ids ids
where team.id = ids.team_id;
insert into gitea.team_repo (org_id, team_id, repo_id)
select ids.org_id, ids.team_id, ids.repo_id
from veles_acl_ids ids
where not exists (
select 1
from gitea.team_repo existing
where existing.team_id = ids.team_id
and existing.repo_id = ids.repo_id
);
delete from gitea.team_unit unit
using veles_acl_ids ids
where unit.team_id = ids.team_id
and unit.type in (1, 2, 3, 4, 5, 8, 9, 10);
insert into gitea.team_unit (org_id, team_id, type, access_mode)
select ids.org_id, ids.team_id, desired.type, desired.access_mode
from veles_acl_ids ids
cross join (
values
(1, 0),
(2, 2),
(3, 0),
(4, 0),
(5, 0),
(8, 0),
(9, 0),
(10, 0)
) as desired(type, access_mode);
commit;
SQL
echo "Veles feedback Gitea ACL ready"

View File

@ -429,24 +429,6 @@ data:
}
}
}
pipelineJob('veles') {
disabled(false)
description('Staged Veles alpha image pipeline. Backend/frontend should build linux/amd64 and linux/arm64; sim-worker may begin amd64-only if Forge dependencies require it.')
definition {
cpsScm {
scm {
git {
remote {
url('https://scm.bstein.dev/bstein/veles.git')
credentials('gitea-pat')
}
branches('*/main')
}
}
scriptPath('Jenkinsfile')
}
}
}
multibranchPipelineJob('titan-iac-quality-gate') {
branchSources {
branchSource {
@ -490,15 +472,6 @@ data:
projectNamingStrategy: "standard"
markupFormatter:
plainText
globalNodeProperties:
- envVars:
env:
- key: "GIT_CONFIG_COUNT"
value: "1"
- key: "GIT_CONFIG_KEY_0"
value: "safe.directory"
- key: "GIT_CONFIG_VALUE_0"
value: "*"
clouds:
- kubernetes:
containerCapStr: "5"
@ -550,11 +523,6 @@ data:
slaveConnectTimeoutStr: "100"
yaml: |
spec:
securityContext:
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
fsGroupChangePolicy: "OnRootMismatch"
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"

View File

@ -155,7 +155,7 @@ spec:
containerPort: 50000
env:
- name: JAVA_OPTS
value: "-Xms512m -Xmx2048m -Duser.timezone=America/Chicago -Dkubernetes.websocket.timeout=60000 -Dorg.csanchez.jenkins.plugins.kubernetes.pipeline.ContainerExecDecorator.websocketConnectionTimeout=120"
value: "-Xms512m -Xmx2048m -Duser.timezone=America/Chicago"
- name: TZ
value: "America/Chicago"
- name: JENKINS_OPTS

View File

@ -80,7 +80,8 @@ spec:
command: ["/bin/sh", "-c"]
args:
- |
:
cp /plugin/mailu-http-listener-0.1.0.jar /providers/
cp -r /plugin/src /providers/src
volumeMounts:
- name: providers
mountPath: /providers
@ -123,7 +124,7 @@ spec:
- name: KC_METRICS_ENABLED
value: "true"
- name: KC_EVENTS_LISTENERS
value: jboss-logging
value: jboss-logging,mailu-http
- name: KC_SPI_EVENTS_LISTENER_MAILU-HTTP_ENDPOINT
value: http://ariadne.maintenance.svc.cluster.local/events
ports:
@ -131,13 +132,6 @@ spec:
name: http
- containerPort: 9000
name: metrics
resources:
requests:
cpu: 100m
memory: 512Mi
limits:
cpu: "1"
memory: 2Gi
readinessProbe:
httpGet:
path: /health/ready
@ -149,7 +143,7 @@ spec:
httpGet:
path: /health/live
port: 9000
initialDelaySeconds: 600
initialDelaySeconds: 60
periodSeconds: 15
failureThreshold: 6
volumeMounts:

View File

@ -27,8 +27,6 @@ resources:
- oneoffs/soteria-oidc-secret-ensure-job.yaml
- oneoffs/quality-oidc-secret-ensure-job.yaml
- oneoffs/agent-oidc-secret-ensure-job.yaml
- oneoffs/veles-realm-ensure-job.yaml
- oneoffs/veles-gitea-oidc-secret-ensure-job.yaml
- oneoffs/metis-ssh-keys-secret-ensure-job.yaml
- oneoffs/metis-node-passwords-secret-ensure-job.yaml
- oneoffs/harbor-oidc-secret-ensure-job.yaml
@ -55,6 +53,3 @@ configMapGenerator:
- name: agent-oidc-secret-ensure-script
files:
- agent_oidc_secret_ensure.sh=scripts/agent_oidc_secret_ensure.sh
- name: veles-gitea-oidc-secret-ensure-script
files:
- veles_gitea_oidc_secret_ensure.sh=scripts/veles_gitea_oidc_secret_ensure.sh

View File

@ -1,53 +0,0 @@
# services/keycloak/oneoffs/veles-gitea-oidc-secret-ensure-job.yaml
# One-off job for sso/veles-gitea-oidc-secret-ensure-5.
# Purpose: create/update the Veles realm Gitea OIDC client and write the
# matching Gitea auth-source secret to Vault.
# Keep suspended until the Vault policy change has reconciled, then unsuspend once.
apiVersion: batch/v1
kind: Job
metadata:
name: veles-gitea-oidc-secret-ensure-5
namespace: sso
spec:
# Created suspended; with backoffLimit 0 a failed pod is not retried, and the
# finished Job is garbage-collected after one hour.
suspend: true
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:
metadata:
# Vault agent injection (pre-populate only) renders the Keycloak admin
# credentials as shell exports in keycloak-admin-env.sh.
annotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/agent-pre-populate-only: "true"
vault.hashicorp.com/role: "sso-secrets"
vault.hashicorp.com/agent-inject-secret-keycloak-admin-env.sh: "kv/data/atlas/shared/keycloak-admin"
vault.hashicorp.com/agent-inject-template-keycloak-admin-env.sh: |
{{ with secret "kv/data/atlas/shared/keycloak-admin" }}
export KEYCLOAK_ADMIN="{{ .Data.data.username }}"
export KEYCLOAK_ADMIN_USER="{{ .Data.data.username }}"
export KEYCLOAK_ADMIN_PASSWORD="{{ .Data.data.password }}"
{{ end }}
spec:
serviceAccountName: mas-secrets-ensure
restartPolicy: Never
# The ensure script comes from a ConfigMap mounted read-only and executable.
volumes:
- name: veles-gitea-oidc-secret-ensure-script
configMap:
name: veles-gitea-oidc-secret-ensure-script
defaultMode: 0555
# Hard requirement: arm64 nodes that carry the worker role label.
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
- key: node-role.kubernetes.io/worker
operator: Exists
# The container command is the mounted script itself; the image is pinned by digest.
containers:
- name: apply
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/scripts/veles_gitea_oidc_secret_ensure.sh"]
volumeMounts:
- name: veles-gitea-oidc-secret-ensure-script
mountPath: /scripts
readOnly: true

View File

@ -1,397 +0,0 @@
# services/keycloak/oneoffs/veles-realm-ensure-job.yaml
# One-off job for sso/veles-realm-ensure-4.
# Purpose: create the Veles realm, groups, OIDC client, SMTP settings, and Vault client secret.
# Keep suspended until Veles Vault paths/policies have reconciled, then unsuspend once.
apiVersion: batch/v1
kind: Job
metadata:
name: veles-realm-ensure-4
namespace: sso
spec:
# Created suspended; with backoffLimit 0 a failed pod is not retried, and the
# finished Job is garbage-collected after one hour.
suspend: true
backoffLimit: 0
ttlSecondsAfterFinished: 3600
template:
metadata:
# Vault agent injection (pre-populate only) renders one env file that exports
# the Keycloak admin credentials and the SMTP credentials. Both SMTP
# variables are filled from the same "apikey" field of the relay secret.
annotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/agent-pre-populate-only: "true"
vault.hashicorp.com/role: "sso-secrets"
vault.hashicorp.com/agent-inject-secret-keycloak-admin-env.sh: "kv/data/atlas/shared/keycloak-admin"
vault.hashicorp.com/agent-inject-template-keycloak-admin-env.sh: |
{{ with secret "kv/data/atlas/shared/keycloak-admin" }}
export KEYCLOAK_ADMIN="{{ .Data.data.username }}"
export KEYCLOAK_ADMIN_USER="{{ .Data.data.username }}"
export KEYCLOAK_ADMIN_PASSWORD="{{ .Data.data.password }}"
{{ end }}
{{ with secret "kv/data/atlas/shared/postmark-relay" }}
export KEYCLOAK_SMTP_USER="{{ index .Data.data "apikey" }}"
export KEYCLOAK_SMTP_PASSWORD="{{ index .Data.data "apikey" }}"
{{ end }}
spec:
serviceAccountName: mas-secrets-ensure
restartPolicy: Never
# Worker nodes are required; arm64 is only a scheduling preference here.
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/worker
operator: Exists
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["arm64"]
containers:
- name: configure
image: python:3.11-alpine
# Static, non-secret settings read by the embedded Python script via os.environ.
env:
- name: KEYCLOAK_SERVER
value: http://keycloak.sso.svc.cluster.local
- name: KEYCLOAK_REALM
value: veles
- name: KEYCLOAK_CLIENT_ID
value: veles-web
- name: KEYCLOAK_PUBLIC_ISSUER
value: https://sso.bstein.dev/realms/veles
- name: VELES_BASE_URL
value: https://veles.bstein.dev
- name: KEYCLOAK_SMTP_HOST
value: mail.bstein.dev
- name: KEYCLOAK_SMTP_PORT
value: "587"
- name: KEYCLOAK_SMTP_FROM
value: no-reply-veles@bstein.dev
- name: KEYCLOAK_SMTP_FROM_NAME
value: Veles
# The shell wrapper sources the Vault-rendered env file, then feeds the
# inline Python program to the interpreter on stdin.
command: ["/bin/sh", "-c"]
args:
- |
set -eu
. /vault/secrets/keycloak-admin-env.sh
python - <<'PY'
import json
import os
import time
import urllib.error
import urllib.parse
import urllib.request
# Required settings, read from the environment; a missing variable raises
# KeyError. Trailing slashes are trimmed from the two base URLs so they can be
# joined with "/..." paths later.
base_url = os.environ["KEYCLOAK_SERVER"].rstrip("/")
realm = os.environ["KEYCLOAK_REALM"]
client_id = os.environ["KEYCLOAK_CLIENT_ID"]
issuer = os.environ["KEYCLOAK_PUBLIC_ISSUER"]
veles_base_url = os.environ["VELES_BASE_URL"].rstrip("/")
admin_user = os.environ["KEYCLOAK_ADMIN_USER"]
admin_password = os.environ["KEYCLOAK_ADMIN_PASSWORD"]
# Minimal JSON-over-HTTP helper built on urllib.
#   method/url : HTTP verb and absolute URL.
#   token      : when set, sent as an "Authorization: Bearer" header.
#   payload    : when not None, JSON-encoded and sent as the request body.
#   headers    : optional extra headers (copied, never mutated).
# Returns (status, body) where body is the decoded JSON, or None for an empty
# response. HTTP error responses are returned as (code, body) instead of being
# raised; an error body that is not JSON is wrapped as {"raw": text}.
def request(method, url, token=None, payload=None, headers=None, timeout=30):
data = None
req_headers = headers.copy() if headers else {}
if token:
req_headers["Authorization"] = f"Bearer {token}"
if payload is not None:
data = json.dumps(payload).encode()
req_headers["Content-Type"] = "application/json"
req = urllib.request.Request(url, data=data, headers=req_headers, method=method)
try:
with urllib.request.urlopen(req, timeout=timeout) as resp:
body = resp.read()
if not body:
return resp.status, None
return resp.status, json.loads(body.decode())
except urllib.error.HTTPError as exc:
raw = exc.read()
if not raw:
return exc.code, None
try:
return exc.code, json.loads(raw.decode())
except Exception:
return exc.code, {"raw": raw.decode(errors="replace")}
# Obtain an admin access token from the master realm using the password grant
# with the admin-cli client. Up to 10 attempts are made; a URLError on the last
# attempt aborts the script, otherwise the loop sleeps attempt*2 seconds.
token_body = None
form = urllib.parse.urlencode(
{
"grant_type": "password",
"client_id": "admin-cli",
"username": admin_user,
"password": admin_password,
}
).encode()
for attempt in range(1, 11):
req = urllib.request.Request(
f"{base_url}/realms/master/protocol/openid-connect/token",
data=form,
headers={"Content-Type": "application/x-www-form-urlencoded"},
method="POST",
)
try:
with urllib.request.urlopen(req, timeout=10) as resp:
token_body = json.loads(resp.read().decode())
break
except urllib.error.URLError as exc:
if attempt == 10:
raise SystemExit(f"Keycloak token request failed after retries: {exc}")
time.sleep(attempt * 2)
token = token_body["access_token"]
# SMTP settings applied to the realm: authenticated, STARTTLS on, implicit SSL
# off. Reply-to reuses the "from" address and display name.
smtp = {
"host": os.environ["KEYCLOAK_SMTP_HOST"],
"port": os.environ["KEYCLOAK_SMTP_PORT"],
"from": os.environ["KEYCLOAK_SMTP_FROM"],
"fromDisplayName": os.environ["KEYCLOAK_SMTP_FROM_NAME"],
"replyTo": os.environ["KEYCLOAK_SMTP_FROM"],
"replyToDisplayName": os.environ["KEYCLOAK_SMTP_FROM_NAME"],
"user": os.environ["KEYCLOAK_SMTP_USER"],
"password": os.environ["KEYCLOAK_SMTP_PASSWORD"],
"auth": "true",
"starttls": "true",
"ssl": "false",
}
# Create the realm when the lookup returns 404; a 409 on create is tolerated.
status, realm_rep = request("GET", f"{base_url}/admin/realms/{realm}", token)
if status == 404:
create_payload = {
"realm": realm,
"enabled": True,
"registrationAllowed": True,
"resetPasswordAllowed": True,
"verifyEmail": True,
"loginWithEmailAllowed": True,
"duplicateEmailsAllowed": False,
"smtpServer": smtp,
}
status, body = request("POST", f"{base_url}/admin/realms", token, create_payload)
if status not in (201, 204, 409):
raise SystemExit(f"Realm create failed: status={status} body={body}")
status, realm_rep = request("GET", f"{base_url}/admin/realms/{realm}", token)
if status != 200 or not isinstance(realm_rep, dict):
raise SystemExit(f"Realm fetch failed: status={status}")
# Overlay the desired settings on the fetched representation and PUT it back,
# so an already existing realm converges to the same configuration.
realm_rep.update(
{
"enabled": True,
"registrationAllowed": True,
"resetPasswordAllowed": True,
"verifyEmail": True,
"loginWithEmailAllowed": True,
"duplicateEmailsAllowed": False,
"smtpServer": smtp,
}
)
status, body = request("PUT", f"{base_url}/admin/realms/{realm}", token, realm_rep)
if status not in (200, 204):
raise SystemExit(f"Realm update failed: status={status} body={body}")
# Return the id of the realm group named exactly `name`, creating it if the
# search does not return it. The search result is filtered for an exact name
# match. Aborts the script on any unexpected status, or if the group is still
# missing after the create call (201/204/409 are accepted on create).
def ensure_group(name):
status, groups = request(
"GET",
f"{base_url}/admin/realms/{realm}/groups?search={urllib.parse.quote(name)}",
token,
)
if status != 200:
raise SystemExit(f"Group search failed for {name}: status={status}")
for group in groups or []:
if group.get("name") == name:
return group["id"]
status, body = request("POST", f"{base_url}/admin/realms/{realm}/groups", token, {"name": name})
if status not in (201, 204, 409):
raise SystemExit(f"Group create failed for {name}: status={status} body={body}")
# Look the group up again to learn the id of the freshly created entry.
status, groups = request(
"GET",
f"{base_url}/admin/realms/{realm}/groups?search={urllib.parse.quote(name)}",
token,
)
if status != 200:
raise SystemExit(f"Group lookup failed after create for {name}: status={status}")
for group in groups or []:
if group.get("name") == name:
return group["id"]
raise SystemExit(f"Group {name} not found after create")
# Return the realm role representation (a dict) for `name`, creating the role
# when the first lookup answers 404. A 409 on create is tolerated. Aborts the
# script if the role cannot be fetched as a dict afterwards.
def ensure_role(name):
status, role = request("GET", f"{base_url}/admin/realms/{realm}/roles/{urllib.parse.quote(name)}", token)
if status == 404:
status, body = request("POST", f"{base_url}/admin/realms/{realm}/roles", token, {"name": name})
if status not in (201, 204, 409):
raise SystemExit(f"Role create failed for {name}: status={status} body={body}")
status, role = request(
"GET",
f"{base_url}/admin/realms/{realm}/roles/{urllib.parse.quote(name)}",
token,
)
if status != 200 or not isinstance(role, dict):
raise SystemExit(f"Role lookup failed for {name}: status={status}")
return role
# Map realm role `role` (a role representation dict) onto the group with id
# `group_id`. Returns early when a mapping with the same role name already
# exists; otherwise POSTs the role and aborts on any status other than 200/204.
def ensure_group_role(group_id, role):
status, mappings = request(
"GET",
f"{base_url}/admin/realms/{realm}/groups/{group_id}/role-mappings/realm",
token,
)
if status != 200:
raise SystemExit(f"Group role mapping lookup failed: status={status}")
if any(mapping.get("name") == role["name"] for mapping in mappings or []):
return
# The endpoint takes a list of role representations.
status, body = request(
"POST",
f"{base_url}/admin/realms/{realm}/groups/{group_id}/role-mappings/realm",
token,
[role],
)
if status not in (200, 204):
raise SystemExit(f"Group role mapping failed for {role['name']}: status={status} body={body}")
# Make the group a realm default group. Does nothing when a default group with
# the same id or the same name is already listed; otherwise issues a PUT with no
# body and aborts on any status other than 200/204.
def ensure_default_group(group_id, name):
status, groups = request("GET", f"{base_url}/admin/realms/{realm}/default-groups", token)
if status != 200:
raise SystemExit(f"Default group lookup failed: status={status}")
for group in groups or []:
if group.get("id") == group_id or group.get("name") == name:
return
status, body = request("PUT", f"{base_url}/admin/realms/{realm}/default-groups/{group_id}", token)
if status not in (200, 204):
raise SystemExit(f"Default group update failed for {name}: status={status} body={body}")
# Groups and roles share the names "alpha" and "admin". The alpha group gets
# the alpha role; the admin group gets both roles. Only alpha is made a realm
# default group.
alpha_group_id = ensure_group("alpha")
admin_group_id = ensure_group("admin")
alpha_role = ensure_role("alpha")
admin_role = ensure_role("admin")
ensure_group_role(alpha_group_id, alpha_role)
ensure_group_role(admin_group_id, alpha_role)
ensure_group_role(admin_group_id, admin_role)
ensure_default_group(alpha_group_id, "alpha")
# Find the OIDC client by clientId; client_uuid stays None when it is absent.
status, clients = request(
"GET",
f"{base_url}/admin/realms/{realm}/clients?clientId={urllib.parse.quote(client_id)}",
token,
)
if status != 200:
raise SystemExit(f"Client lookup failed: status={status}")
client_uuid = clients[0]["id"] if clients else None
# Desired client: confidential (publicClient False), authorization-code flow
# only, PKCE with S256, redirects and web origin limited to the Veles base URL.
client_payload = {
"clientId": client_id,
"enabled": True,
"protocol": "openid-connect",
"publicClient": False,
"standardFlowEnabled": True,
"implicitFlowEnabled": False,
"directAccessGrantsEnabled": False,
"serviceAccountsEnabled": False,
"redirectUris": [f"{veles_base_url}/*"],
"webOrigins": [veles_base_url],
"rootUrl": veles_base_url,
"baseUrl": "/",
"attributes": {
"pkce.code.challenge.method": "S256",
"post.logout.redirect.uris": f"{veles_base_url}/*",
},
}
if not client_uuid:
status, body = request("POST", f"{base_url}/admin/realms/{realm}/clients", token, client_payload)
if status not in (201, 204, 409):
raise SystemExit(f"Client create failed: status={status} body={body}")
# NOTE(review): the status of this second lookup is not checked before
# `clients` is indexed, unlike the first lookup above.
status, clients = request(
"GET",
f"{base_url}/admin/realms/{realm}/clients?clientId={urllib.parse.quote(client_id)}",
token,
)
client_uuid = clients[0]["id"] if clients else None
if not client_uuid:
raise SystemExit("Client veles-web not found after create")
# Always PUT the payload so an existing client converges to the same settings.
status, body = request(
"PUT",
f"{base_url}/admin/realms/{realm}/clients/{client_uuid}",
token,
client_payload,
)
if status not in (200, 204):
raise SystemExit(f"Client update failed: status={status} body={body}")
# Group-membership protocol mapper: emits a "groups" claim in the ID token,
# access token and userinfo, with full.path set to "false".
mapper_payload = {
"name": "groups",
"protocol": "openid-connect",
"protocolMapper": "oidc-group-membership-mapper",
"consentRequired": False,
"config": {
"full.path": "false",
"id.token.claim": "true",
"access.token.claim": "true",
"userinfo.token.claim": "true",
"claim.name": "groups",
"jsonType.label": "String",
},
}
status, mappers = request(
"GET",
f"{base_url}/admin/realms/{realm}/clients/{client_uuid}/protocol-mappers/models",
token,
)
if status != 200:
raise SystemExit(f"Mapper lookup failed: status={status}")
# Update in place when a mapper named "groups" already exists, else create it.
mapper_id = next((mapper.get("id") for mapper in mappers or [] if mapper.get("name") == "groups"), None)
if mapper_id:
mapper_payload["id"] = mapper_id
status, body = request(
"PUT",
f"{base_url}/admin/realms/{realm}/clients/{client_uuid}/protocol-mappers/models/{mapper_id}",
token,
mapper_payload,
)
else:
status, body = request(
"POST",
f"{base_url}/admin/realms/{realm}/clients/{client_uuid}/protocol-mappers/models",
token,
mapper_payload,
)
if status not in (200, 201, 204):
raise SystemExit(f"Mapper ensure failed: status={status} body={body}")
# Read the client secret Keycloak holds for the client.
status, secret = request(
"GET",
f"{base_url}/admin/realms/{realm}/clients/{client_uuid}/client-secret",
token,
)
client_secret = (secret or {}).get("value")
if status != 200 or not client_secret:
raise SystemExit(f"Client secret fetch failed: status={status}")
# Log in to Vault through the Kubernetes auth method using the pod's
# service-account token. VAULT_ADDR and VAULT_ROLE are optional overrides.
vault_addr = os.environ.get("VAULT_ADDR", "http://vault.vault.svc.cluster.local:8200")
# NOTE(review): the token file is opened without a context manager, so the
# handle is only closed when the object is garbage-collected.
jwt = open("/var/run/secrets/kubernetes.io/serviceaccount/token", encoding="utf-8").read().strip()
login_payload = json.dumps({"jwt": jwt, "role": os.environ.get("VAULT_ROLE", "sso-secrets")}).encode()
req = urllib.request.Request(
f"{vault_addr}/v1/auth/kubernetes/login",
data=login_payload,
headers={"Content-Type": "application/json"},
method="POST",
)
with urllib.request.urlopen(req, timeout=20) as resp:
vault_token = json.loads(resp.read().decode())["auth"]["client_token"]
# Store the OIDC settings at kv/data/atlas/veles/veles-oidc, wrapped in the
# {"data": ...} envelope.
payload = {
"data": {
"client_id": client_id,
"client_secret": client_secret,
"issuer": issuer,
"realm": realm,
"required_groups": "alpha,admin",
}
}
req = urllib.request.Request(
f"{vault_addr}/v1/kv/data/atlas/veles/veles-oidc",
data=json.dumps(payload).encode(),
headers={"X-Vault-Token": vault_token, "Content-Type": "application/json"},
method="POST",
)
with urllib.request.urlopen(req, timeout=20) as resp:
if resp.status not in (200, 204):
raise SystemExit(f"Vault write returned {resp.status}")
print("Veles Keycloak realm/client ready")
PY

View File

@ -1,330 +0,0 @@
#!/usr/bin/env sh
set -euo pipefail
. /vault/secrets/keycloak-admin-env.sh
# Runtime configuration. Every value can be overridden through the environment;
# the fallbacks target the veles realm and the public Gitea instance.
KC_URL="${KEYCLOAK_SERVER:-http://keycloak.sso.svc.cluster.local}"
REALM="${KEYCLOAK_REALM:-veles}"
CLIENT_ID="${KEYCLOAK_CLIENT_ID:-gitea}"
PUBLIC_BASE_URL="${GITEA_PUBLIC_BASE_URL:-https://scm.bstein.dev}"
AUTH_SOURCE_NAME="${GITEA_AUTH_SOURCE_NAME:-veles}"
TESTER_GROUP="${VELES_GITEA_TESTER_GROUP:-veles-tester}"
VAULT_SECRET_PATH="${VAULT_SECRET_PATH:-gitea/gitea-veles-oidc}"
# The JSON fallback must not be written inside ${VAR:-...}: the first unescaped
# "}" of the JSON would close the expansion and the remaining "}}" would be
# appended to the result unconditionally, corrupting any value supplied through
# GITEA_GROUP_TEAM_MAP. Apply the default in a separate step instead.
GROUP_TEAM_MAP="${GITEA_GROUP_TEAM_MAP:-}"
if [ -z "${GROUP_TEAM_MAP}" ]; then
  GROUP_TEAM_MAP='{"veles-tester":{"veles-alpha":["testers"]}}'
fi
# Filled in by the token retry loop that follows.
ACCESS_TOKEN=""
# Best-effort wait for Keycloak: poll the master realm up to 10 times with a
# growing sleep. The loop does not fail by itself when Keycloak never answers;
# the token loop below decides whether the script can continue.
for attempt in 1 2 3 4 5 6 7 8 9 10; do
  if curl -fsS "${KC_URL}/realms/master" >/dev/null 2>&1; then
    break
  fi
  echo "Waiting for Keycloak to be reachable (attempt ${attempt})" >&2
  sleep $((attempt * 2))
done

# Request an admin token with the password grant (admin-cli client), retrying
# up to 5 times. The credentials are sent with --data-urlencode so characters
# such as "&", "+", "%" or "=" in the username or password cannot break the
# form-encoded request body.
for attempt in 1 2 3 4 5; do
  TOKEN_JSON="$(curl -sS -X POST "$KC_URL/realms/master/protocol/openid-connect/token" \
    -H 'Content-Type: application/x-www-form-urlencoded' \
    -d "grant_type=password" \
    -d "client_id=admin-cli" \
    --data-urlencode "username=${KEYCLOAK_ADMIN}" \
    --data-urlencode "password=${KEYCLOAK_ADMIN_PASSWORD}" || true)"
  # jq prints the literal string "null" when the field is missing.
  ACCESS_TOKEN="$(echo "$TOKEN_JSON" | jq -r '.access_token' 2>/dev/null || true)"
  if [ -n "$ACCESS_TOKEN" ] && [ "$ACCESS_TOKEN" != "null" ]; then
    break
  fi
  echo "Keycloak token request failed (attempt ${attempt})" >&2
  sleep $((attempt * 2))
done

# Abort when no usable token was obtained after all attempts.
if [ -z "$ACCESS_TOKEN" ] || [ "$ACCESS_TOKEN" = "null" ]; then
  echo "Failed to fetch Keycloak admin token" >&2
  exit 1
fi
# ensure_group NAME
# Prints (without newline) the id of the realm group whose name equals NAME,
# creating the group first when the search does not return it. Exits 1 when the
# create call answers anything other than 201/204/409, or when the group is
# still missing afterwards.
# NOTE: group_name, groups, group_id and status are not declared local; they are
# only isolated when the function is called inside a command substitution.
ensure_group() {
group_name="$1"
groups="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/groups?search=$(printf '%s' "${group_name}" | jq -sRr @uri)" || true)"
group_id="$(echo "$groups" | jq -r --arg name "$group_name" '.[]? | select(.name == $name) | .id' | head -n1 || true)"
if [ -n "$group_id" ] && [ "$group_id" != "null" ]; then
printf '%s' "$group_id"
return
fi
status="$(curl -sS -o /tmp/keycloak-group-create.json -w "%{http_code}" -X POST \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \
-d "$(jq -nc --arg name "$group_name" '{name:$name}')" \
"${KC_URL}/admin/realms/${REALM}/groups")"
if [ "$status" != "201" ] && [ "$status" != "204" ] && [ "$status" != "409" ]; then
echo "Keycloak group create failed for ${group_name} (status ${status})" >&2
cat /tmp/keycloak-group-create.json >&2 || true
exit 1
fi
# Search again to learn the id of the group that now exists.
groups="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/groups?search=$(printf '%s' "${group_name}" | jq -sRr @uri)" || true)"
group_id="$(echo "$groups" | jq -r --arg name "$group_name" '.[]? | select(.name == $name) | .id' | head -n1 || true)"
if [ -z "$group_id" ] || [ "$group_id" = "null" ]; then
echo "Keycloak group ${group_name} not found after create" >&2
exit 1
fi
printf '%s' "$group_id"
}
# ensure_role NAME
# Prints the JSON representation of realm role NAME, creating the role when the
# lookup does not return an object whose .name equals NAME. Exits 1 when the
# create call answers anything other than 201/204/409, or when the role is still
# not returned afterwards.
ensure_role() {
role_name="$1"
role="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/roles/$(printf '%s' "${role_name}" | jq -sRr @uri)" || true)"
if echo "$role" | jq -e --arg name "$role_name" '.name == $name' >/dev/null 2>&1; then
printf '%s' "$role"
return
fi
status="$(curl -sS -o /tmp/keycloak-role-create.json -w "%{http_code}" -X POST \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \
-d "$(jq -nc --arg name "$role_name" '{name:$name}')" \
"${KC_URL}/admin/realms/${REALM}/roles")"
if [ "$status" != "201" ] && [ "$status" != "204" ] && [ "$status" != "409" ]; then
echo "Keycloak role create failed for ${role_name} (status ${status})" >&2
cat /tmp/keycloak-role-create.json >&2 || true
exit 1
fi
# Fetch the role again so the caller receives the full representation.
role="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/roles/$(printf '%s' "${role_name}" | jq -sRr @uri)" || true)"
if ! echo "$role" | jq -e --arg name "$role_name" '.name == $name' >/dev/null 2>&1; then
echo "Keycloak role ${role_name} not found after create" >&2
exit 1
fi
printf '%s' "$role"
}
# ensure_group_role GROUP_ID ROLE_JSON
# Maps the realm role described by ROLE_JSON onto group GROUP_ID. Returns
# without changes when a mapping with the same role name is already present;
# otherwise POSTs the role as a one-element JSON array and exits 1 on any status
# other than 200/204.
ensure_group_role() {
group_id="$1"
role_json="$2"
role_name="$(echo "$role_json" | jq -r '.name')"
mappings="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/groups/${group_id}/role-mappings/realm" || true)"
if echo "$mappings" | jq -e --arg name "$role_name" '.[]? | select(.name == $name)' >/dev/null 2>&1; then
return
fi
status="$(curl -sS -o /tmp/keycloak-group-role.json -w "%{http_code}" -X POST \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \
-d "[$role_json]" \
"${KC_URL}/admin/realms/${REALM}/groups/${group_id}/role-mappings/realm")"
if [ "$status" != "200" ] && [ "$status" != "204" ]; then
echo "Keycloak group role mapping failed for ${role_name} (status ${status})" >&2
cat /tmp/keycloak-group-role.json >&2 || true
exit 1
fi
}
# ensure_mapper CLIENT_UUID MAPPER_NAME MAPPER_PAYLOAD
# Creates or updates a protocol mapper on the given client. When a mapper named
# MAPPER_NAME already exists, its id is merged into the payload and the mapper
# is replaced with PUT; otherwise the payload is POSTed as a new mapper. Exits 1
# on any status other than 200/201/204.
ensure_mapper() {
client_uuid="$1"
mapper_name="$2"
mapper_payload="$3"
mappers="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/clients/${client_uuid}/protocol-mappers/models" || true)"
mapper_id="$(echo "$mappers" | jq -r --arg name "$mapper_name" '.[]? | select(.name == $name) | .id' | head -n1 || true)"
if [ -n "$mapper_id" ] && [ "$mapper_id" != "null" ]; then
mapper_payload="$(echo "$mapper_payload" | jq --arg id "$mapper_id" '. + {id:$id}')"
status="$(curl -sS -o /tmp/keycloak-mapper.json -w "%{http_code}" -X PUT \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \
-d "${mapper_payload}" \
"${KC_URL}/admin/realms/${REALM}/clients/${client_uuid}/protocol-mappers/models/${mapper_id}")"
else
status="$(curl -sS -o /tmp/keycloak-mapper.json -w "%{http_code}" -X POST \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \
-d "${mapper_payload}" \
"${KC_URL}/admin/realms/${REALM}/clients/${client_uuid}/protocol-mappers/models")"
fi
if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
echo "Keycloak mapper ensure failed for ${mapper_name} (status ${status})" >&2
cat /tmp/keycloak-mapper.json >&2 || true
exit 1
fi
}
# ensure_client_scope NAME
# Prints (without newline) the id of the client scope named NAME, creating an
# openid-connect scope when the lookup does not return it. The new scope is
# included in the token scope and hidden on the consent screen. Exits 1 when the
# create call answers anything other than 201/204/409, or when the scope is
# still missing afterwards.
ensure_client_scope() {
scope_name="$1"
scopes="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/client-scopes?search=$(printf '%s' "${scope_name}" | jq -sRr @uri)" || true)"
scope_id="$(echo "$scopes" | jq -r --arg name "$scope_name" '.[]? | select(.name == $name) | .id' | head -n1 || true)"
if [ -n "$scope_id" ] && [ "$scope_id" != "null" ]; then
printf '%s' "$scope_id"
return
fi
scope_payload="$(jq -nc --arg name "$scope_name" '{name:$name,protocol:"openid-connect",attributes:{"include.in.token.scope":"true","display.on.consent.screen":"false"}}')"
status="$(curl -sS -o /tmp/keycloak-client-scope-create.json -w "%{http_code}" -X POST \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \
-d "${scope_payload}" \
"${KC_URL}/admin/realms/${REALM}/client-scopes")"
if [ "$status" != "201" ] && [ "$status" != "204" ] && [ "$status" != "409" ]; then
echo "Keycloak client scope create failed for ${scope_name} (status ${status})" >&2
cat /tmp/keycloak-client-scope-create.json >&2 || true
exit 1
fi
# List the scopes again to learn the id of the scope that now exists.
scopes="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/client-scopes?search=$(printf '%s' "${scope_name}" | jq -sRr @uri)" || true)"
scope_id="$(echo "$scopes" | jq -r --arg name "$scope_name" '.[]? | select(.name == $name) | .id' | head -n1 || true)"
if [ -z "$scope_id" ] || [ "$scope_id" = "null" ]; then
echo "Keycloak client scope ${scope_name} not found after create" >&2
exit 1
fi
printf '%s' "$scope_id"
}
# ensure_scope_mapper SCOPE_ID MAPPER_NAME MAPPER_PAYLOAD
# Same create-or-update logic as ensure_mapper, but against the protocol mappers
# of a client scope instead of a client. Exits 1 on any status other than
# 200/201/204.
ensure_scope_mapper() {
scope_id="$1"
mapper_name="$2"
mapper_payload="$3"
mappers="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/client-scopes/${scope_id}/protocol-mappers/models" || true)"
mapper_id="$(echo "$mappers" | jq -r --arg name "$mapper_name" '.[]? | select(.name == $name) | .id' | head -n1 || true)"
if [ -n "$mapper_id" ] && [ "$mapper_id" != "null" ]; then
mapper_payload="$(echo "$mapper_payload" | jq --arg id "$mapper_id" '. + {id:$id}')"
status="$(curl -sS -o /tmp/keycloak-scope-mapper.json -w "%{http_code}" -X PUT \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \
-d "${mapper_payload}" \
"${KC_URL}/admin/realms/${REALM}/client-scopes/${scope_id}/protocol-mappers/models/${mapper_id}")"
else
status="$(curl -sS -o /tmp/keycloak-scope-mapper.json -w "%{http_code}" -X POST \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \
-d "${mapper_payload}" \
"${KC_URL}/admin/realms/${REALM}/client-scopes/${scope_id}/protocol-mappers/models")"
fi
if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
echo "Keycloak client-scope mapper ensure failed for ${mapper_name} (status ${status})" >&2
cat /tmp/keycloak-scope-mapper.json >&2 || true
exit 1
fi
}
# ensure_client_optional_scope CLIENT_UUID SCOPE_ID SCOPE_NAME
# Attaches the scope to the client as an optional client scope unless a scope
# named SCOPE_NAME is already attached as default or optional. The attach is
# tried with PUT first and retried with POST when PUT does not answer
# 200/201/204; exits 1 when the POST fails as well.
ensure_client_optional_scope() {
client_uuid="$1"
scope_id="$2"
scope_name="$3"
default_scopes="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/clients/${client_uuid}/default-client-scopes" || true)"
optional_scopes="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/clients/${client_uuid}/optional-client-scopes" || true)"
if echo "$default_scopes" | jq -e --arg name "$scope_name" '.[]? | select(.name == $name)' >/dev/null 2>&1 \
|| echo "$optional_scopes" | jq -e --arg name "$scope_name" '.[]? | select(.name == $name)' >/dev/null 2>&1; then
return
fi
status="$(curl -sS -o /dev/null -w "%{http_code}" -X PUT \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/clients/${client_uuid}/optional-client-scopes/${scope_id}")"
if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
status="$(curl -sS -o /dev/null -w "%{http_code}" -X POST \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/clients/${client_uuid}/optional-client-scopes/${scope_id}")"
if [ "$status" != "200" ] && [ "$status" != "201" ] && [ "$status" != "204" ]; then
echo "Failed to attach ${scope_name} client scope to ${CLIENT_ID} (status ${status})" >&2
exit 1
fi
fi
}
# Ensure the tester group and a realm role of the same name exist, then map the
# role onto the group.
TESTER_GROUP_ID="$(ensure_group "${TESTER_GROUP}")"
TESTER_ROLE="$(ensure_role "${TESTER_GROUP}")"
ensure_group_role "${TESTER_GROUP_ID}" "${TESTER_ROLE}"
# Look up the OIDC client by clientId; jq yields "null" when none exists.
CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/clients?clientId=$(printf '%s' "${CLIENT_ID}" | jq -sRr @uri)" || true)"
CLIENT_UUID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
# Desired client: confidential, authorization-code flow only, with a single
# redirect URI pointing at the Gitea OAuth2 callback for this auth source. The
# PKCE challenge method attribute is set to an empty string.
client_payload="$(jq -nc \
--arg client_id "${CLIENT_ID}" \
--arg root_url "${PUBLIC_BASE_URL}" \
--arg callback "${PUBLIC_BASE_URL}/user/oauth2/${AUTH_SOURCE_NAME}/callback" \
'{clientId:$client_id,enabled:true,protocol:"openid-connect",publicClient:false,standardFlowEnabled:true,implicitFlowEnabled:false,directAccessGrantsEnabled:false,serviceAccountsEnabled:false,redirectUris:[$callback],webOrigins:[$root_url],rootUrl:$root_url,baseUrl:"/",attributes:{"pkce.code.challenge.method":"","post.logout.redirect.uris":($root_url + "/*")}}')"
# Create the client when it is missing (409 is tolerated), then re-query its id.
if [ -z "$CLIENT_UUID" ] || [ "$CLIENT_UUID" = "null" ]; then
status="$(curl -sS -o /tmp/keycloak-client-create.json -w "%{http_code}" -X POST \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \
-d "${client_payload}" \
"${KC_URL}/admin/realms/${REALM}/clients")"
if [ "$status" != "201" ] && [ "$status" != "204" ] && [ "$status" != "409" ]; then
echo "Keycloak client create failed for ${CLIENT_ID} (status ${status})" >&2
cat /tmp/keycloak-client-create.json >&2 || true
exit 1
fi
CLIENT_QUERY="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/clients?clientId=$(printf '%s' "${CLIENT_ID}" | jq -sRr @uri)" || true)"
CLIENT_UUID="$(echo "$CLIENT_QUERY" | jq -r '.[0].id' 2>/dev/null || true)"
fi
if [ -z "$CLIENT_UUID" ] || [ "$CLIENT_UUID" = "null" ]; then
echo "Keycloak client ${CLIENT_ID} not found after create" >&2
exit 1
fi
# Always PUT the payload so an existing client converges to the same settings.
status="$(curl -sS -o /tmp/keycloak-client-update.json -w "%{http_code}" -X PUT \
-H "Authorization: Bearer ${ACCESS_TOKEN}" \
-H 'Content-Type: application/json' \
-d "${client_payload}" \
"${KC_URL}/admin/realms/${REALM}/clients/${CLIENT_UUID}")"
if [ "$status" != "200" ] && [ "$status" != "204" ]; then
echo "Keycloak client update failed for ${CLIENT_ID} (status ${status})" >&2
cat /tmp/keycloak-client-update.json >&2 || true
exit 1
fi
# Mapper payloads: "groups" emits group membership (full.path "false") and
# "roles" emits realm roles as a multivalued claim; both are added to the ID
# token, access token and userinfo.
groups_mapper_payload="$(jq -nc \
'{name:"groups",protocol:"openid-connect",protocolMapper:"oidc-group-membership-mapper",consentRequired:false,config:{"full.path":"false","id.token.claim":"true","access.token.claim":"true","userinfo.token.claim":"true","claim.name":"groups","jsonType.label":"String"}}')"
roles_mapper_payload="$(jq -nc \
'{name:"roles",protocol:"openid-connect",protocolMapper:"oidc-usermodel-realm-role-mapper",consentRequired:false,config:{"multivalued":"true","id.token.claim":"true","access.token.claim":"true","userinfo.token.claim":"true","claim.name":"roles","jsonType.label":"String"}}')"
# Both mappers go directly on the client. The groups mapper is additionally
# placed on a "groups" client scope that is attached to the client as optional.
ensure_mapper "${CLIENT_UUID}" groups "${groups_mapper_payload}"
ensure_mapper "${CLIENT_UUID}" roles "${roles_mapper_payload}"
GROUPS_SCOPE_ID="$(ensure_client_scope groups)"
ensure_scope_mapper "${GROUPS_SCOPE_ID}" groups "${groups_mapper_payload}"
ensure_client_optional_scope "${CLIENT_UUID}" "${GROUPS_SCOPE_ID}" groups
# Read the client secret Keycloak holds for the client.
CLIENT_SECRET="$(curl -sS -H "Authorization: Bearer ${ACCESS_TOKEN}" \
"${KC_URL}/admin/realms/${REALM}/clients/${CLIENT_UUID}/client-secret" | jq -r '.value' 2>/dev/null || true)"
if [ -z "$CLIENT_SECRET" ] || [ "$CLIENT_SECRET" = "null" ]; then
echo "Keycloak client secret not found for ${CLIENT_ID}" >&2
exit 1
fi
# Log in to Vault through the Kubernetes auth method using the pod's
# service-account token. VAULT_ADDR and VAULT_ROLE are optional overrides.
vault_addr="${VAULT_ADDR:-http://vault.vault.svc.cluster.local:8200}"
vault_role="${VAULT_ROLE:-sso-secrets}"
jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
login_payload="$(jq -nc --arg jwt "${jwt}" --arg role "${vault_role}" '{jwt:$jwt, role:$role}')"
vault_token="$(curl -sS --request POST --data "${login_payload}" \
"${vault_addr}/v1/auth/kubernetes/login" | jq -r '.auth.client_token')"
if [ -z "${vault_token}" ] || [ "${vault_token}" = "null" ]; then
echo "vault login failed" >&2
exit 1
fi
# Secret document written to Vault. The tester group name is used both as the
# required "groups" claim value and as the restricted group; group_team_map is
# stored as a JSON string, not as a nested object.
payload="$(jq -nc \
--arg client_id "${CLIENT_ID}" \
--arg client_secret "${CLIENT_SECRET}" \
--arg issuer "https://sso.bstein.dev/realms/${REALM}" \
--arg auto_discovery_url "https://sso.bstein.dev/realms/${REALM}/.well-known/openid-configuration" \
--arg auth_source_name "${AUTH_SOURCE_NAME}" \
--arg tester_group "${TESTER_GROUP}" \
--arg group_team_map "${GROUP_TEAM_MAP}" \
'{data:{client_id:$client_id,client_secret:$client_secret,issuer:$issuer,openid_auto_discovery_url:$auto_discovery_url,auth_source_name:$auth_source_name,required_claim_name:"groups",required_claim_value:$tester_group,group_claim_name:"groups",restricted_group:$tester_group,group_team_map:$group_team_map}}')"
# Write under kv/data/atlas/<VAULT_SECRET_PATH>; only 200/204 count as success.
write_status="$(curl -sS -o /tmp/veles-gitea-oidc-write.json -w "%{http_code}" -X POST \
-H "X-Vault-Token: ${vault_token}" \
-H 'Content-Type: application/json' \
-d "${payload}" "${vault_addr}/v1/kv/data/atlas/${VAULT_SECRET_PATH}")"
if [ "${write_status}" != "200" ] && [ "${write_status}" != "204" ]; then
echo "Vault write failed for ${VAULT_SECRET_PATH} (status ${write_status})" >&2
cat /tmp/veles-gitea-oidc-write.json >&2 || true
exit 1
fi
echo "Veles Gitea OIDC client ready in Keycloak and Vault"

View File

@ -20,6 +20,18 @@ spec:
values:
- rpi5
- rpi4
- key: kubernetes.io/hostname
operator: NotIn
values:
- titan-04
- titan-06
- titan-12
- titan-13
- titan-14
- titan-15
- titan-17
- titan-18
- titan-19
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 80
preference:

View File

@ -537,8 +537,6 @@ spec:
$patch: delete
- name: VAULT_ENV_FILE
value: /vault/secrets/mailu-env.sh
- name: MAILU_POSTFIX_DISABLE_POSTLOG_UNIX_DGRAM
value: "true"
volumeMounts:
- name: mailu-vault-entrypoint
mountPath: /entrypoint.sh

View File

@ -31,24 +31,4 @@ if [ -n "${VAULT_COPY_FILES:-}" ]; then
IFS="$old_ifs"
fi
# Put the application virtualenv first on PATH when /app/venv/bin exists.
if [ -d /app/venv/bin ]; then
PATH="/app/venv/bin:$PATH"
export PATH
fi
# Install a PATH-shadowing `postfix` wrapper when the opt-in flag is "true" OR
# both /usr/sbin/postfix and /usr/sbin/postconf are executable.
# NOTE(review): with the flag set, the wrapper is created even if those binaries
# are absent — confirm that is intended.
if [ "${MAILU_POSTFIX_DISABLE_POSTLOG_UNIX_DGRAM:-}" = "true" ] || { [ -x /usr/sbin/postfix ] && [ -x /usr/sbin/postconf ]; }; then
mkdir -p /tmp/mailu-wrapper-bin
# The quoted 'EOF' delimiter writes the wrapper body verbatim, with nothing
# expanded at generation time. On `postfix start-fg` the wrapper runs
# `postconf -MX postlog/unix-dgram` (errors ignored; presumably drops that
# master.cf entry — verify against postconf(1)) and then execs the postfix
# master directly with `-c /etc/postfix -d`. Any other invocation is passed
# through unchanged to /usr/sbin/postfix.
cat > /tmp/mailu-wrapper-bin/postfix <<'EOF'
#!/bin/sh
if [ "${1:-}" = "start-fg" ]; then
/usr/sbin/postconf -MX postlog/unix-dgram 2>/dev/null || true
exec /usr/libexec/postfix/master -c /etc/postfix -d
fi
exec /usr/sbin/postfix "$@"
EOF
chmod 0755 /tmp/mailu-wrapper-bin/postfix
# Prepend the wrapper directory so bare `postfix` lookups resolve to the
# wrapper; callers using the absolute /usr/sbin/postfix path are unaffected.
PATH="/tmp/mailu-wrapper-bin:$PATH"
export PATH
fi
# Replace this shell with the command passed as the entrypoint's arguments.
exec "$@"

View File

@ -52,9 +52,9 @@ resources:
- metis-ingress.yaml
images:
- name: registry.bstein.dev/bstein/ariadne
newTag: 0.1.0-293 # {"$imagepolicy": "maintenance:ariadne:tag"}
newTag: 0.1.0-258 # {"$imagepolicy": "maintenance:ariadne:tag"}
- name: registry.bstein.dev/bstein/metis
newTag: 0.1.0-179-arm64 # {"$imagepolicy": "maintenance:metis-arm64:tag"}
newTag: 0.1.0-145-arm64 # {"$imagepolicy": "maintenance:metis-arm64:tag"}
- name: registry.bstein.dev/bstein/soteria
newTag: 0.1.0-36 # {"$imagepolicy": "maintenance:soteria:tag"}
configMapGenerator:

View File

@ -9,14 +9,14 @@ data:
METIS_INVENTORY_PATH: /app/inventory.titan-rpi4.yaml
METIS_DATA_DIR: /var/lib/metis
METIS_DEFAULT_FLASH_HOST: titan-20
METIS_FLASH_HOSTS: titan-20,titan-21,titan-22,titan-23,titan-24,titan-19,titan-17,titan-15,titan-14,titan-12,titan-11,titan-10,titan-09,titan-08,titan-07,titan-06,titan-05,titan-04,titan-0c,titan-0b,titan-0a
METIS_FLASH_HOSTS: titan-20,titan-21,titan-22,titan-24,titan-19,titan-17,titan-15,titan-14,titan-12,titan-11,titan-10,titan-09,titan-08,titan-07,titan-06,titan-05,titan-04,titan-0c,titan-0b,titan-0a
METIS_LOCAL_HOST: titan-20
METIS_ALLOWED_GROUPS: admin,maintenance
METIS_MAX_DEVICE_BYTES: "1000000000000"
METIS_NAMESPACE: maintenance
METIS_REMOTE_POD_TIMEOUT_SEC: "14400"
METIS_RUNNER_IMAGE_AMD64: registry.bstein.dev/bstein/metis:0.1.0-179-amd64 # {"$imagepolicy": "maintenance:metis-amd64"}
METIS_RUNNER_IMAGE_ARM64: registry.bstein.dev/bstein/metis:0.1.0-179-arm64 # {"$imagepolicy": "maintenance:metis-arm64"}
METIS_RUNNER_IMAGE_AMD64: registry.bstein.dev/bstein/metis:0.1.0-145-amd64 # {"$imagepolicy": "maintenance:metis-amd64"}
METIS_RUNNER_IMAGE_ARM64: registry.bstein.dev/bstein/metis:0.1.0-145-arm64 # {"$imagepolicy": "maintenance:metis-arm64"}
METIS_HARBOR_REGISTRY: registry.bstein.dev
METIS_HARBOR_PROJECT: metis
METIS_HARBOR_API_BASE: https://registry.bstein.dev/api/v2.0

View File

@ -32,7 +32,7 @@ spec:
kubernetes.io/arch: amd64
containers:
- name: metis-sentinel
image: registry.bstein.dev/bstein/metis-sentinel:0.1.0-179-amd64 # {"$imagepolicy": "maintenance:metis-sentinel-amd64"}
image: registry.bstein.dev/bstein/metis-sentinel:0.1.0-145-amd64 # {"$imagepolicy": "maintenance:metis-sentinel-amd64"}
imagePullPolicy: Always
envFrom:
- configMapRef:

View File

@ -32,7 +32,7 @@ spec:
kubernetes.io/arch: arm64
containers:
- name: metis-sentinel
image: registry.bstein.dev/bstein/metis-sentinel:0.1.0-179-arm64 # {"$imagepolicy": "maintenance:metis-sentinel-arm64"}
image: registry.bstein.dev/bstein/metis-sentinel:0.1.0-145-arm64 # {"$imagepolicy": "maintenance:metis-sentinel-arm64"}
imagePullPolicy: Always
envFrom:
- configMapRef:

View File

@ -14,8 +14,6 @@ spec:
metadata:
labels:
app: node-nofile
annotations:
bstein.dev/restarted-at: "2026-06-10T08:40:00Z"
spec:
serviceAccountName: node-nofile
tolerations:

View File

@ -2,7 +2,6 @@
set -euo pipefail
limit_line="LimitNOFILE=1048576"
sysctl_file="/host/etc/sysctl.d/99-atlas-inotify.conf"
changed=0
for unit in k3s k3s-agent; do
@ -18,17 +17,6 @@ for unit in k3s k3s-agent; do
fi
done
mkdir -p "$(dirname "${sysctl_file}")"
cat > "${sysctl_file}" <<'EOF'
fs.inotify.max_user_instances = 8192
fs.inotify.max_user_watches = 524288
fs.inotify.max_queued_events = 32768
EOF
printf "8192" > /host/proc/sys/fs/inotify/max_user_instances
printf "524288" > /host/proc/sys/fs/inotify/max_user_watches
printf "32768" > /host/proc/sys/fs/inotify/max_queued_events
if [ "${changed}" -eq 1 ]; then
sleep "$(( (RANDOM % 300) + 10 ))"
chroot /host /bin/systemctl daemon-reload

View File

@ -2130,7 +2130,7 @@
},
"targets": [
{
"expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) or (sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",suite=~\"ariadne|atlasbot|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=\"supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})))) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) or (sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",suite=~\"ariadne|atlasbot|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=\"supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})))) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((((sum by (suite, branch, check, status) 
(platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube\",status!=\"\"})) or (sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",suite=~\"ariadne|atlasbot|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=\"supply_chain\",status!=\"\"})))) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
"expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) or (sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",suite=~\"ariadne|atlasbot|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=\"supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})))) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) or (sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",suite=~\"ariadne|atlasbot|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=\"supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})))) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube\",status!=\"\"})) or (sum by 
(suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",suite=~\"ariadne|atlasbot|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=\"supply_chain\",status!=\"\"})))) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
"refId": "A",
"instant": true
}
@ -2216,7 +2216,7 @@
},
"targets": [
{
"expr": "100 * ((sum(platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper|lesavka\",status=~\"ok|passed|success\"}) or on() vector(0))) / clamp_min(((sum(platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper|lesavka\"}) or on() vector(0))), 1)",
"expr": "100 * ((sum(platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper\",status=~\"ok|passed|success\"}) or on() vector(0))) / clamp_min(((sum(platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper\"}) or on() vector(0))), 1)",
"refId": "A",
"instant": true
}
@ -2302,7 +2302,7 @@
},
"targets": [
{
"expr": "(sum(platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper|lesavka\",status!~\"ok|passed|success\"}) or on() vector(0))",
"expr": "(sum(platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper\",status!~\"ok|passed|success\"}) or on() vector(0))",
"refId": "A",
"instant": true
}
@ -2384,7 +2384,7 @@
},
"targets": [
{
"expr": "sum((sum by (suite) (platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\"}) > bool 0)) or on() vector(0)",
"expr": "sum((sum by (suite) (platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\"}) > bool 0)) or on() vector(0)",
"refId": "A",
"instant": true
}
@ -2404,15 +2404,15 @@
},
{
"color": "dark-yellow",
"value": 8
"value": 7
},
{
"color": "dark-green",
"value": 9
"value": 8
},
{
"color": "dark-blue",
"value": 10
"value": 9
}
]
},
@ -2466,7 +2466,7 @@
},
"targets": [
{
"expr": "(avg((max by (suite) (platform_quality:suite_coverage_percent:latest_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\"}))) or on() vector(0))",
"expr": "(avg((max by (suite) (platform_quality:suite_coverage_percent:latest_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\"}))) or on() vector(0))",
"refId": "A",
"instant": true
}
@ -2819,7 +2819,7 @@
},
"targets": [
{
"expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})",
"expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})",
"refId": "A",
"legendFormat": "{{category}}",
"format": "time_series",

View File

@ -157,7 +157,7 @@
{
"id": 3,
"type": "stat",
"title": "CI Run Success Rate (7d)",
"title": "CI Run Success Rate (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -170,7 +170,7 @@
},
"targets": [
{
"expr": "100 * ((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[7d:1h])) or on() vector(0))) / clamp_min(((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1h])) or on() vector(0))), 1)",
"expr": "100 * ((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[30d:15m])) or on() vector(0))) / clamp_min(((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[30d:15m])) or on() vector(0))), 1)",
"refId": "A",
"instant": true
}
@ -227,7 +227,7 @@
},
"textMode": "value"
},
"description": "Percent of selected quality-gate CI runs that completed successfully in 7d; higher means more stable automation."
"description": "Percent of selected quality-gate CI runs that completed successfully in 30d; higher means more stable automation."
},
{
"id": 4,
@ -990,7 +990,7 @@
},
"targets": [
{
"expr": "(100 * sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[7d:1h])) / (sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1h])))) and on(suite) ((sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1h]))) > 0)",
"expr": "(100 * sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[7d:1m])) / (sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1m])))) and on(suite) ((sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1m]))) > 0)",
"refId": "A",
"legendFormat": "{{suite}}",
"format": "time_series",
@ -1050,8 +1050,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 153,
@ -1131,7 +1130,6 @@
"sort": "none"
}
},
"timeFrom": "7d",
"links": [
{
"title": "Open Jenkins",
@ -1227,16 +1225,6 @@
"title": "data_prepper: Last Artifacts",
"url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/",
"targetBlank": true
},
{
"title": "lesavka: Job",
"url": "${jenkins_base}/job/lesavka/",
"targetBlank": true
},
{
"title": "lesavka: Last Artifacts",
"url": "${jenkins_base}/job/lesavka/lastCompletedBuild/artifact/",
"targetBlank": true
}
]
},
@ -1292,7 +1280,6 @@
"mode": "multi"
}
},
"timeFrom": "7d",
"description": "Twenty-four-hour rolling quality-gate run counts for the selected suite/branch scope. This is volume, not a pass-rate percentage."
},
{
@ -1372,8 +1359,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 14,
@ -1452,8 +1438,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
}
],
"description": "Recent CI run, coverage, LOC, and raw test-result trends for selected suites."
@ -1547,8 +1532,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 131,
@ -1627,8 +1611,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 132,
@ -1707,8 +1690,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 133,
@ -1787,8 +1769,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 134,
@ -1867,8 +1848,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 135,
@ -1947,8 +1927,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 136,
@ -2027,8 +2006,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
}
],
"description": "Failure percent by check family; blue is zero failures, warmer colors show blockers."
@ -2122,8 +2100,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 139,
@ -2202,8 +2179,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 140,
@ -2282,8 +2258,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 141,
@ -2362,8 +2337,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 142,
@ -2442,8 +2416,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 143,
@ -2522,8 +2495,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 144,
@ -2602,8 +2574,7 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
}
],
"description": "Healthy percent by check family; blue means all selected checks are good."
@ -2709,7 +2680,6 @@
"sort": "none"
}
},
"timeFrom": "24h",
"links": [
{
"title": "Open Jenkins",
@ -2805,23 +2775,13 @@
"title": "data_prepper: Last Artifacts",
"url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/",
"targetBlank": true
},
{
"title": "lesavka: Job",
"url": "${jenkins_base}/job/lesavka/",
"targetBlank": true
},
{
"title": "lesavka: Last Artifacts",
"url": "${jenkins_base}/job/lesavka/lastCompletedBuild/artifact/",
"targetBlank": true
}
]
},
{
"id": 147,
"type": "bargauge",
"title": "Most Problematic Test by Suite (7d)",
"title": "Most Problematic Test by Suite (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -2834,7 +2794,7 @@
},
"targets": [
{
"expr": "sort_desc(topk by (suite) (1, (sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[7d:1h])))))",
"expr": "sort_desc(topk by (suite) (1, (sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[30d:1h])))))",
"refId": "A",
"legendFormat": "{{suite}} \u00b7 {{test}}",
"instant": true
@ -2994,16 +2954,6 @@
"title": "data_prepper: Last Artifacts",
"url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/",
"targetBlank": true
},
{
"title": "lesavka: Job",
"url": "${jenkins_base}/job/lesavka/",
"targetBlank": true
},
{
"title": "lesavka: Last Artifacts",
"url": "${jenkins_base}/job/lesavka/lastCompletedBuild/artifact/",
"targetBlank": true
}
],
"transformations": [
@ -3023,7 +2973,7 @@
}
}
],
"description": "Worst test per suite summed across 7d. This catches repeat offenders while keeping dashboard loads bounded; current hourly top list is quiet."
"description": "Worst test per suite summed across 30d. This catches historical repeat offenders even when the current hourly top list is quiet."
},
{
"id": 146,
@ -3189,20 +3139,9 @@
"title": "data_prepper: Last Artifacts",
"url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/",
"targetBlank": true
},
{
"title": "lesavka: Job",
"url": "${jenkins_base}/job/lesavka/",
"targetBlank": true
},
{
"title": "lesavka: Last Artifacts",
"url": "${jenkins_base}/job/lesavka/lastCompletedBuild/artifact/",
"targetBlank": true
}
],
"description": "Stacked hourly outcome volume for the selected suite/branch/test scope. This uses vmalert rollups only, avoiding expensive raw long-range per-test scans.",
"timeFrom": "24h"
"description": "Stacked hourly outcome volume for the selected suite/branch/test scope. This uses vmalert rollups only, avoiding expensive raw 30-day per-test scans."
},
{
"id": 152,
@ -3294,7 +3233,6 @@
"sort": "none"
}
},
"timeFrom": "24h",
"links": [
{
"title": "Open Jenkins",
@ -3390,16 +3328,6 @@
"title": "data_prepper: Last Artifacts",
"url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/",
"targetBlank": true
},
{
"title": "lesavka: Job",
"url": "${jenkins_base}/job/lesavka/",
"targetBlank": true
},
{
"title": "lesavka: Last Artifacts",
"url": "${jenkins_base}/job/lesavka/lastCompletedBuild/artifact/",
"targetBlank": true
}
]
}
@ -3434,7 +3362,7 @@
},
"targets": [
{
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\")) and on(suite) count by (suite) ({__name__=~\".*_quality_gate_tests_total\",exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\"))))",
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\")) and on(suite) count by (suite) ({__name__=~\".*_quality_gate_tests_total\",exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\"))))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -3517,7 +3445,7 @@
},
"targets": [
{
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\")) and on(suite) count by (suite) ({__name__=~\".*_quality_gate_checks_total\",exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\"))))",
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\")) and on(suite) count by (suite) ({__name__=~\".*_quality_gate_checks_total\",exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\"))))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -3600,7 +3528,7 @@
},
"targets": [
{
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_workspace_line_coverage_percent{exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\"))))",
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_workspace_line_coverage_percent{exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\"))))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -3683,7 +3611,7 @@
},
"targets": [
{
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_source_lines_over_500_total{exported_job=\"platform-quality-ci\"}) and on(suite) count by (suite) (platform_quality_gate_source_files_total{exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\"))))",
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_source_lines_over_500_total{exported_job=\"platform-quality-ci\"}) and on(suite) count by (suite) (platform_quality_gate_source_files_total{exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\"))))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -3766,7 +3694,7 @@
},
"targets": [
{
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_test_case_result{exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\"))))",
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_test_case_result{exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\"))))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -3849,7 +3777,7 @@
},
"targets": [
{
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_test_case_result{exported_job=\"platform-quality-ci\",test!=\"__no_test_cases__\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\"))))",
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_test_case_result{exported_job=\"platform-quality-ci\",test!=\"__no_test_cases__\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\"))))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -3919,7 +3847,7 @@
{
"id": 150,
"type": "bargauge",
"title": "Primary Branch Clean by Suite (7d)",
"title": "Primary Branch Clean by Suite (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -3932,7 +3860,7 @@
},
"targets": [
{
"expr": "sort((100 * (((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[7d:1h]))) > bool 0) unless on(suite) ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[7d:1h]))) > bool 0))) or on(suite) (0 * ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[7d:1h]))) > bool 0)))",
"expr": "sort((100 * (((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d:15m]))) > bool 0) unless on(suite) ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[30d:15m]))) > bool 0))) or on(suite) (0 * ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d:15m]))) > bool 0)))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -4081,16 +4009,6 @@
"title": "data_prepper: Last Artifacts",
"url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/",
"targetBlank": true
},
{
"title": "lesavka: Job",
"url": "${jenkins_base}/job/lesavka/",
"targetBlank": true
},
{
"title": "lesavka: Last Artifacts",
"url": "${jenkins_base}/job/lesavka/lastCompletedBuild/artifact/",
"targetBlank": true
}
],
"transformations": [
@ -4109,7 +4027,7 @@
{
"id": 149,
"type": "bargauge",
"title": "Recent Branch Evidence by Suite (7d)",
"title": "Recent Branch Evidence by Suite (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -4122,7 +4040,7 @@
},
"targets": [
{
"expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[7d:1h])))",
"expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d:15m])))",
"refId": "A",
"legendFormat": "{{suite}} \u00b7 {{branch}}",
"instant": true
@ -4259,16 +4177,6 @@
"title": "data_prepper: Last Artifacts",
"url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/",
"targetBlank": true
},
{
"title": "lesavka: Job",
"url": "${jenkins_base}/job/lesavka/",
"targetBlank": true
},
{
"title": "lesavka: Last Artifacts",
"url": "${jenkins_base}/job/lesavka/lastCompletedBuild/artifact/",
"targetBlank": true
}
],
"transformations": [
@ -4638,15 +4546,14 @@
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
}
],
"description": "SonarQube availability, projects, fetch errors, and gate status."
}
],
"time": {
"from": "now-24h",
"from": "now-30d",
"to": "now"
},
"annotations": {
@ -4666,7 +4573,7 @@
"name": "suite",
"label": "Suite",
"type": "custom",
"query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper,lesavka : lesavka",
"query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper",
"current": {
"text": "All",
"value": "$__all",
@ -4717,17 +4624,12 @@
"text": "data_prepper",
"value": "data_prepper",
"selected": false
},
{
"text": "lesavka",
"value": "lesavka",
"selected": false
}
],
"hide": 0,
"multi": false,
"includeAll": true,
"allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka",
"allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper",
"refresh": 1,
"sort": 1,
"skipUrlSync": false
@ -4781,7 +4683,7 @@
"name": "test",
"label": "Test Case",
"type": "query",
"query": "query_result(topk(75, count by (test) (max_over_time(platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category!~\"fixtures|golden|helpers\"}[24h:1h]))))",
"query": "query_result(topk(250, count by (test) (max_over_time(platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category!~\"fixtures|golden|helpers\"}[$__range]))))",
"regex": "/test=\"([^\"]+)\"/",
"current": {
"text": "All",

View File

@ -38,12 +38,6 @@ spec:
operator: NotIn
values:
- "true"
- key: veles.bstein.dev/node-pool
operator: NotIn
values:
- oceanus
- key: node-role.kubernetes.io/accelerator
operator: Exists
tolerations:
- operator: Exists
containers:

View File

@ -2139,7 +2139,7 @@ data:
},
"targets": [
{
"expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) or (sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",suite=~\"ariadne|atlasbot|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=\"supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})))) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) or (sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",suite=~\"ariadne|atlasbot|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=\"supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})))) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((((sum by (suite, branch, check, status) 
(platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube\",status!=\"\"})) or (sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",suite=~\"ariadne|atlasbot|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=\"supply_chain\",status!=\"\"})))) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
"expr": "(avg((min by (suite) (((100 * sum by (suite) (clamp_max(max by (suite, check) ((((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) or (sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",suite=~\"ariadne|atlasbot|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=\"supply_chain\",status=~\"ok|passed|success|not_applicable|skipped|na|n/a\"})))) > 0), 1) unless on(suite, check) (clamp_max(max by (suite, check) ((((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})) or (sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",suite=~\"ariadne|atlasbot|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=\"supply_chain\",status!~\"ok|passed|success|not_applicable|skipped|na|n/a\"})))) > 0), 1))) / clamp_min(sum by (suite) (clamp_max(max by (suite, check) ((((sum by (suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=~\"tests|coverage|loc|style|docs_naming|gate_glue|sonarqube\",status!=\"\"})) or (sum by 
(suite, branch, check, status) (platform_quality:check_status:present_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",suite=~\"ariadne|atlasbot|bstein_home|data_prepper\",branch=~\"main|master|origin/main|origin/master\",check=\"supply_chain\",status!=\"\"})))) > 0), 1)), 1))) or (min by (suite) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|manual|performance|regression|reliability|security|smoke|system|ui|unit\"}))))) or on() vector(0))",
"refId": "A",
"instant": true
}
@ -2225,7 +2225,7 @@ data:
},
"targets": [
{
"expr": "100 * ((sum(platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper|lesavka\",status=~\"ok|passed|success\"}) or on() vector(0))) / clamp_min(((sum(platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper|lesavka\"}) or on() vector(0))), 1)",
"expr": "100 * ((sum(platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper\",status=~\"ok|passed|success\"}) or on() vector(0))) / clamp_min(((sum(platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper\"}) or on() vector(0))), 1)",
"refId": "A",
"instant": true
}
@ -2311,7 +2311,7 @@ data:
},
"targets": [
{
"expr": "(sum(platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper|lesavka\",status!~\"ok|passed|success\"}) or on() vector(0))",
"expr": "(sum(platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|pegasus-health|pegasus_health|soteria|titan_iac|titan-iac|bstein_home|bstein-home|data_prepper|data-prepper\",status!~\"ok|passed|success\"}) or on() vector(0))",
"refId": "A",
"instant": true
}
@ -2393,7 +2393,7 @@ data:
},
"targets": [
{
"expr": "sum((sum by (suite) (platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\"}) > bool 0)) or on() vector(0)",
"expr": "sum((sum by (suite) (platform_quality:suite_runs:increase_24h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\"}) > bool 0)) or on() vector(0)",
"refId": "A",
"instant": true
}
@ -2413,15 +2413,15 @@ data:
},
{
"color": "dark-yellow",
"value": 8
"value": 7
},
{
"color": "dark-green",
"value": 9
"value": 8
},
{
"color": "dark-blue",
"value": 10
"value": 9
}
]
},
@ -2475,7 +2475,7 @@ data:
},
"targets": [
{
"expr": "(avg((max by (suite) (platform_quality:suite_coverage_percent:latest_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\"}))) or on() vector(0))",
"expr": "(avg((max by (suite) (platform_quality:suite_coverage_percent:latest_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\"}))) or on() vector(0))",
"refId": "A",
"instant": true
}
@ -2828,7 +2828,7 @@ data:
},
"targets": [
{
"expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})",
"expr": "avg by (category) (platform_quality:test_category_health_rate:percent_1h{suite=~\"ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper\",branch!=\"\",branch=~\"main|master|origin/main|origin/master\",category=~\"api|chaos|compatibility|component|contract|e2e|integration|performance|regression|reliability|security|smoke|system|ui\"})",
"refId": "A",
"legendFormat": "{{category}}",
"format": "time_series",

View File

@ -166,7 +166,7 @@ data:
{
"id": 3,
"type": "stat",
"title": "CI Run Success Rate (7d)",
"title": "CI Run Success Rate (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -179,7 +179,7 @@ data:
},
"targets": [
{
"expr": "100 * ((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[7d:1h])) or on() vector(0))) / clamp_min(((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1h])) or on() vector(0))), 1)",
"expr": "100 * ((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[30d:15m])) or on() vector(0))) / clamp_min(((sum(increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[30d:15m])) or on() vector(0))), 1)",
"refId": "A",
"instant": true
}
@ -236,7 +236,7 @@ data:
},
"textMode": "value"
},
"description": "Percent of selected quality-gate CI runs that completed successfully in 7d; higher means more stable automation."
"description": "Percent of selected quality-gate CI runs that completed successfully in 30d; higher means more stable automation."
},
{
"id": 4,
@ -999,7 +999,7 @@ data:
},
"targets": [
{
"expr": "(100 * sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[7d:1h])) / (sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1h])))) and on(suite) ((sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1h]))) > 0)",
"expr": "(100 * sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\",status=~\"ok|passed|success\"}))[7d:1m])) / (sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1m])))) and on(suite) ((sum by (suite) (increase((max without(instance, job) (platform_quality_gate_runs_total{suite=~\"${suite:regex}\",exported_job=\"platform-quality-ci\"}))[7d:1m]))) > 0)",
"refId": "A",
"legendFormat": "{{suite}}",
"format": "time_series",
@ -1059,8 +1059,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 153,
@ -1140,7 +1139,6 @@ data:
"sort": "none"
}
},
"timeFrom": "7d",
"links": [
{
"title": "Open Jenkins",
@ -1236,16 +1234,6 @@ data:
"title": "data_prepper: Last Artifacts",
"url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/",
"targetBlank": true
},
{
"title": "lesavka: Job",
"url": "${jenkins_base}/job/lesavka/",
"targetBlank": true
},
{
"title": "lesavka: Last Artifacts",
"url": "${jenkins_base}/job/lesavka/lastCompletedBuild/artifact/",
"targetBlank": true
}
]
},
@ -1301,7 +1289,6 @@ data:
"mode": "multi"
}
},
"timeFrom": "7d",
"description": "Twenty-four-hour rolling quality-gate run counts for the selected suite/branch scope. This is volume, not a pass-rate percentage."
},
{
@ -1381,8 +1368,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 14,
@ -1461,8 +1447,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
}
],
"description": "Recent CI run, coverage, LOC, and raw test-result trends for selected suites."
@ -1556,8 +1541,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 131,
@ -1636,8 +1620,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 132,
@ -1716,8 +1699,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 133,
@ -1796,8 +1778,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 134,
@ -1876,8 +1857,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 135,
@ -1956,8 +1936,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 136,
@ -2036,8 +2015,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
}
],
"description": "Failure percent by check family; blue is zero failures, warmer colors show blockers."
@ -2131,8 +2109,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 139,
@ -2211,8 +2188,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 140,
@ -2291,8 +2267,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 141,
@ -2371,8 +2346,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 142,
@ -2451,8 +2425,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 143,
@ -2531,8 +2504,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
},
{
"id": 144,
@ -2611,8 +2583,7 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
}
],
"description": "Healthy percent by check family; blue means all selected checks are good."
@ -2718,7 +2689,6 @@ data:
"sort": "none"
}
},
"timeFrom": "24h",
"links": [
{
"title": "Open Jenkins",
@ -2814,23 +2784,13 @@ data:
"title": "data_prepper: Last Artifacts",
"url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/",
"targetBlank": true
},
{
"title": "lesavka: Job",
"url": "${jenkins_base}/job/lesavka/",
"targetBlank": true
},
{
"title": "lesavka: Last Artifacts",
"url": "${jenkins_base}/job/lesavka/lastCompletedBuild/artifact/",
"targetBlank": true
}
]
},
{
"id": 147,
"type": "bargauge",
"title": "Most Problematic Test by Suite (7d)",
"title": "Most Problematic Test by Suite (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -2843,7 +2803,7 @@ data:
},
"targets": [
{
"expr": "sort_desc(topk by (suite) (1, (sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[7d:1h])))))",
"expr": "sort_desc(topk by (suite) (1, (sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",status=\"failed\"}[30d:1h])))))",
"refId": "A",
"legendFormat": "{{suite}} \u00b7 {{test}}",
"instant": true
@ -3003,16 +2963,6 @@ data:
"title": "data_prepper: Last Artifacts",
"url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/",
"targetBlank": true
},
{
"title": "lesavka: Job",
"url": "${jenkins_base}/job/lesavka/",
"targetBlank": true
},
{
"title": "lesavka: Last Artifacts",
"url": "${jenkins_base}/job/lesavka/lastCompletedBuild/artifact/",
"targetBlank": true
}
],
"transformations": [
@ -3032,7 +2982,7 @@ data:
}
}
],
"description": "Worst test per suite summed across 7d. This catches repeat offenders while keeping dashboard loads bounded; current hourly top list is quiet."
"description": "Worst test per suite summed across 30d. This catches historical repeat offenders even when the current hourly top list is quiet."
},
{
"id": 146,
@ -3198,20 +3148,9 @@ data:
"title": "data_prepper: Last Artifacts",
"url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/",
"targetBlank": true
},
{
"title": "lesavka: Job",
"url": "${jenkins_base}/job/lesavka/",
"targetBlank": true
},
{
"title": "lesavka: Last Artifacts",
"url": "${jenkins_base}/job/lesavka/lastCompletedBuild/artifact/",
"targetBlank": true
}
],
"description": "Stacked hourly outcome volume for the selected suite/branch/test scope. This uses vmalert rollups only, avoiding expensive raw long-range per-test scans.",
"timeFrom": "24h"
"description": "Stacked hourly outcome volume for the selected suite/branch/test scope. This uses vmalert rollups only, avoiding expensive raw 30-day per-test scans."
},
{
"id": 152,
@ -3303,7 +3242,6 @@ data:
"sort": "none"
}
},
"timeFrom": "24h",
"links": [
{
"title": "Open Jenkins",
@ -3399,16 +3337,6 @@ data:
"title": "data_prepper: Last Artifacts",
"url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/",
"targetBlank": true
},
{
"title": "lesavka: Job",
"url": "${jenkins_base}/job/lesavka/",
"targetBlank": true
},
{
"title": "lesavka: Last Artifacts",
"url": "${jenkins_base}/job/lesavka/lastCompletedBuild/artifact/",
"targetBlank": true
}
]
}
@ -3443,7 +3371,7 @@ data:
},
"targets": [
{
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\")) and on(suite) count by (suite) ({__name__=~\".*_quality_gate_tests_total\",exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\"))))",
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\")) and on(suite) count by (suite) ({__name__=~\".*_quality_gate_tests_total\",exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\"))))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -3526,7 +3454,7 @@ data:
},
"targets": [
{
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\")) and on(suite) count by (suite) ({__name__=~\".*_quality_gate_checks_total\",exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\"))))",
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\")) and on(suite) count by (suite) ({__name__=~\".*_quality_gate_checks_total\",exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\"))))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -3609,7 +3537,7 @@ data:
},
"targets": [
{
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_workspace_line_coverage_percent{exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\"))))",
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_workspace_line_coverage_percent{exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\"))))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -3692,7 +3620,7 @@ data:
},
"targets": [
{
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_source_lines_over_500_total{exported_job=\"platform-quality-ci\"}) and on(suite) count by (suite) (platform_quality_gate_source_files_total{exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\"))))",
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_source_lines_over_500_total{exported_job=\"platform-quality-ci\"}) and on(suite) count by (suite) (platform_quality_gate_source_files_total{exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\"))))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -3775,7 +3703,7 @@ data:
},
"targets": [
{
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_test_case_result{exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\"))))",
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_test_case_result{exported_job=\"platform-quality-ci\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\"))))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -3858,7 +3786,7 @@ data:
},
"targets": [
{
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_test_case_result{exported_job=\"platform-quality-ci\",test!=\"__no_test_cases__\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"lesavka\", \"__name__\", \".*\"))))",
"expr": "sort((100 * (((label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\")) and on(suite) count by (suite) (platform_quality_gate_test_case_result{exported_job=\"platform-quality-ci\",test!=\"__no_test_cases__\"})))) or on(suite) (0 * (label_replace(vector(1), \"suite\", \"ariadne\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"metis\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"ananke\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"atlasbot\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"pegasus\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"soteria\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"titan_iac\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"bstein_home\", \"__name__\", \".*\") or label_replace(vector(1), \"suite\", \"data_prepper\", \"__name__\", \".*\"))))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -3928,7 +3856,7 @@ data:
{
"id": 150,
"type": "bargauge",
"title": "Primary Branch Clean by Suite (7d)",
"title": "Primary Branch Clean by Suite (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -3941,7 +3869,7 @@ data:
},
"targets": [
{
"expr": "sort((100 * (((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[7d:1h]))) > bool 0) unless on(suite) ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[7d:1h]))) > bool 0))) or on(suite) (0 * ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[7d:1h]))) > bool 0)))",
"expr": "sort((100 * (((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d:15m]))) > bool 0) unless on(suite) ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\",branch!~\"main|master|origin/main|origin/master|unknown\"}[30d:15m]))) > bool 0))) or on(suite) (0 * ((count by (suite) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d:15m]))) > bool 0)))",
"refId": "A",
"legendFormat": "{{suite}}",
"instant": true
@ -4090,16 +4018,6 @@ data:
"title": "data_prepper: Last Artifacts",
"url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/",
"targetBlank": true
},
{
"title": "lesavka: Job",
"url": "${jenkins_base}/job/lesavka/",
"targetBlank": true
},
{
"title": "lesavka: Last Artifacts",
"url": "${jenkins_base}/job/lesavka/lastCompletedBuild/artifact/",
"targetBlank": true
}
],
"transformations": [
@ -4118,7 +4036,7 @@ data:
{
"id": 149,
"type": "bargauge",
"title": "Recent Branch Evidence by Suite (7d)",
"title": "Recent Branch Evidence by Suite (30d)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -4131,7 +4049,7 @@ data:
},
"targets": [
{
"expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[7d:1h])))",
"expr": "sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",exported_job=\"platform-quality-ci\"}[30d:15m])))",
"refId": "A",
"legendFormat": "{{suite}} \u00b7 {{branch}}",
"instant": true
@ -4268,16 +4186,6 @@ data:
"title": "data_prepper: Last Artifacts",
"url": "${jenkins_base}/job/data-prepper/lastCompletedBuild/artifact/",
"targetBlank": true
},
{
"title": "lesavka: Job",
"url": "${jenkins_base}/job/lesavka/",
"targetBlank": true
},
{
"title": "lesavka: Last Artifacts",
"url": "${jenkins_base}/job/lesavka/lastCompletedBuild/artifact/",
"targetBlank": true
}
],
"transformations": [
@ -4647,15 +4555,14 @@ data:
"mode": "single",
"sort": "none"
}
},
"timeFrom": "7d"
}
}
],
"description": "SonarQube availability, projects, fetch errors, and gate status."
}
],
"time": {
"from": "now-24h",
"from": "now-30d",
"to": "now"
},
"annotations": {
@ -4675,7 +4582,7 @@ data:
"name": "suite",
"label": "Suite",
"type": "custom",
"query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper,lesavka : lesavka",
"query": "ariadne : ariadne,metis : metis,ananke : ananke,atlasbot : atlasbot,pegasus : pegasus,soteria : soteria,titan_iac : titan_iac,bstein_home : bstein_home,data_prepper : data_prepper",
"current": {
"text": "All",
"value": "$__all",
@ -4726,17 +4633,12 @@ data:
"text": "data_prepper",
"value": "data_prepper",
"selected": false
},
{
"text": "lesavka",
"value": "lesavka",
"selected": false
}
],
"hide": 0,
"multi": false,
"includeAll": true,
"allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper|lesavka",
"allValue": "ariadne|metis|ananke|atlasbot|pegasus|soteria|titan_iac|bstein_home|data_prepper",
"refresh": 1,
"sort": 1,
"skipUrlSync": false
@ -4790,7 +4692,7 @@ data:
"name": "test",
"label": "Test Case",
"type": "query",
"query": "query_result(topk(75, count by (test) (max_over_time(platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category!~\"fixtures|golden|helpers\"}[24h:1h]))))",
"query": "query_result(topk(250, count by (test) (max_over_time(platform_quality:test_case_health_rate:percent_1h{suite=~\"${suite:regex}\",branch!=\"\",branch=~\"${branch:regex}\",test!=\"\",test!=\"__no_test_cases__\",category!~\"fixtures|golden|helpers\"}[$__range]))))",
"regex": "/test=\"([^\"]+)\"/",
"current": {
"text": "All",

View File

@ -50,15 +50,6 @@ spec:
upgrade:
disableWait: true
values:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: veles.bstein.dev/node-pool
operator: NotIn
values:
- oceanus
rbac:
pspEnabled: false
service:
@ -75,8 +66,6 @@ metadata:
namespace: monitoring
spec:
interval: 15m
upgrade:
disableWait: true
chart:
spec:
chart: victoria-metrics-single
@ -95,22 +84,11 @@ spec:
persistentVolume:
enabled: true
size: 100Gi
resources:
requests:
cpu: 250m
memory: 1Gi
limits:
cpu: "1"
memory: 2Gi
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: longhorn-host
operator: In
values:
- "true"
- key: kubernetes.io/hostname
operator: NotIn
values:

View File

@ -68,12 +68,6 @@ spec:
operator: NotIn
values:
- "true"
- key: veles.bstein.dev/node-pool
operator: NotIn
values:
- oceanus
- key: node-role.kubernetes.io/accelerator
operator: Exists
tolerations:
- operator: Exists
containers:

View File

@ -19,7 +19,8 @@ spec:
app: collabora
spec:
nodeSelector:
kubernetes.io/arch: amd64
hardware: rpi5
node-role.kubernetes.io/worker: "true"
containers:
- name: collabora
image: collabora/code@sha256:3c58d0e9bae75e4647467d0c7d91cb66f261d3e814709aed590b5c334a04db26

View File

@ -240,11 +240,6 @@ write_policy_and_role "game-stream" "game-stream" "game-stream-vault" \
"game-stream/*" ""
write_policy_and_role "openclaw" "openclaw" "agent-vault" \
"openclaw/*" ""
write_policy_and_role "veles" "veles" "veles-backend,veles-postgres,veles-vault-sync" \
"veles/* shared/harbor-pull shared/postmark-relay" ""
write_policy_and_role "veles-secrets" "veles" "veles-secrets-ensure" \
"shared/postmark-relay" \
"veles/*"
write_policy_and_role "maintenance" "maintenance" "ariadne,maintenance-vault-sync,metis" \
"maintenance/ariadne-db maintenance/metis-oidc maintenance/soteria-oidc maintenance/metis-ssh-keys maintenance/metis-runtime portal/atlas-portal-db portal/bstein-dev-home-keycloak-admin mailu/mailu-db-secret mailu/mailu-initial-account-secret nextcloud/nextcloud-db nextcloud/nextcloud-admin health/wger-admin finance/firefly-secrets comms/mas-admin-client-runtime comms/atlasbot-credentials-runtime comms/synapse-db comms/synapse-admin vault/vault-oidc-config shared/harbor-pull shared/soteria-restic harbor/harbor-core" "" \
'
@ -271,8 +266,8 @@ write_policy_and_role "vault" "vault" "vault" \
"vault/*" ""
write_policy_and_role "sso-secrets" "sso" "mas-secrets-ensure" \
"shared/keycloak-admin shared/postmark-relay maintenance/metis-ssh-keys" \
"harbor/harbor-oidc vault/vault-oidc-config comms/synapse-oidc logging/oauth2-proxy-logs-oidc finance/actual-oidc maintenance/metis-oidc maintenance/soteria-oidc maintenance/metis-ssh-keys openclaw/agent-oidc veles/veles-oidc gitea/gitea-veles-oidc" \
"shared/keycloak-admin maintenance/metis-ssh-keys" \
"harbor/harbor-oidc vault/vault-oidc-config comms/synapse-oidc logging/oauth2-proxy-logs-oidc finance/actual-oidc maintenance/metis-oidc maintenance/soteria-oidc maintenance/metis-ssh-keys openclaw/agent-oidc" \
'
path "kv/data/atlas/nodes/*" {
capabilities = ["create", "update", "read"]

View File

@ -1,85 +0,0 @@
# Veles Infrastructure Contract
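This stack is staged for Flux. The app deployments are intended to stay at `replicas: 0` until images, native OIDC/session support, and smoke gates are ready. Note that the committed manifests currently set `veles-backend` to `replicas: 1` and `veles-frontend` to `replicas: 2`, so confirm the intended scale before reconciling.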
## Cluster Contract
- Namespace: `veles`
- Hostname: `https://veles.bstein.dev`
- Alpha environment: runs in the same `veles` namespace; no alternate alpha namespace is used.
- Backend service: `veles-backend.veles.svc.cluster.local:80`
- Frontend service: `veles-frontend.veles.svc.cluster.local:80`
- Postgres service: `veles-postgres.veles.svc.cluster.local:5432`
- Artifact PVC: `veles-artifacts`, mounted at `/data/veles-artifacts`
- Storage classes: `veles-oceanus-db`, `veles-oceanus-artifacts`
- Images:
- `registry.bstein.dev/veles/veles-backend`
- `registry.bstein.dev/veles/veles-frontend`
- `registry.bstein.dev/veles/veles-sim-worker`
- Backend `http` container port: `8796`
- Frontend `http` container port: `8080`
- Backend/frontend deployments remain scaled to `0` until native OIDC/session support, image tags, and smoke gates are ready. Services route to a named `http` target port so Ingress does not depend on numeric container ports.
## Auth Contract
Veles owns authorization in the app. The `veles` Ingress does not use oauth2-proxy or Traefik forward-auth, so no ingress/auth layer should strip OIDC token claims. The app should validate tokens from `https://sso.bstein.dev/realms/veles` and expect stable `sub`, `email`, `preferred_username`, `groups`, and `realm_access.roles` claims. Do not scale Veles for real user traffic until native OIDC login/session flow is implemented and smoke-tested.
The Keycloak realm setup creates both groups and realm roles named `alpha` and `admin`. Members of the `alpha` group receive the `alpha` realm role; members of `admin` receive both `alpha` and `admin`. Built-in/meta strategies can stay universal, while runs and user-created strategies should remain user-scoped in the Veles database.
## Runtime Env
Veles should consume:
- `VELES_PUBLIC_BASE_URL=https://veles.bstein.dev`
- `VELES_OIDC_ISSUER=https://sso.bstein.dev/realms/veles`
- `VELES_OIDC_CLIENT_ID=veles-web`
- `VELES_OIDC_REQUIRED_GROUPS=alpha,admin`
- `VELES_OIDC_GROUPS_CLAIM=groups`
- `VELES_OIDC_ROLES_CLAIM=realm_access.roles`
- `DATABASE_URL` from `kv/data/atlas/veles/veles-db`
- `VELES_SESSION_SECRET` from `kv/data/atlas/veles/app-secrets`
- `VELES_BYOK_ENCRYPTION_KEY` from `kv/data/atlas/veles/app-secrets`
User OpenAI API keys must stay in the Veles database encrypted with `VELES_BYOK_ENCRYPTION_KEY`; do not store per-user BYOK secrets in Vault.
Backend runtime secrets are synced from Vault through the `veles-vault` SecretProviderClass (mounted by the `veles-vault-sync` Deployment) into the generated Kubernetes Secret `veles-runtime-secrets`; no secret values are committed. The backend consumes that secret with `envFrom`.
## Artifact Contract
`veles-artifacts` is an RWO Longhorn PVC mounted into backend pods at `/data/veles-artifacts`. Backend pods own artifact writes and serving. Simulation Jobs should not mount or write directly to this PVC unless they are explicitly scheduled on Oceanus with the Veles toleration and the app has chosen a same-node direct-write model. Queue-mediated upload/copy through the backend remains the safer default until the app contract settles.
Backend, simulation workers, and retention/cleanup workers must run on Oceanus/titan-23 when they need artifact access. Frontend pods must not mount `veles-artifacts`.
## Simulation Jobs
The backend service account can create, watch, and delete Jobs only inside the `veles` namespace. Simulation pods should use service account `veles-sim`, set `automountServiceAccountToken: false`, and use:
```yaml
priorityClassName: veles-sim
nodeSelector:
veles.bstein.dev/simulation: "true"
tolerations:
- key: veles.bstein.dev/simulation
operator: Equal
value: "true"
effect: NoSchedule
```
Retention/cleanup Jobs that touch artifacts should use the same node selector and toleration. If they do not need Kubernetes API access, use `veles-sim`; otherwise keep control-plane actions in the backend/controller and run artifact cleanup through a no-token worker.
## Staged Operator Steps
1. Join `titan-23`/Oceanus to Atlas as a worker.
2. Use Metis with `titan-23` in `METIS_FLASH_HOSTS`; the existing node secret placeholder uses `192.168.22.23`.
3. Confirm the node normalizer applies the Veles labels and taint.
4. Add Oceanus Longhorn disks at paths tagged by the Longhorn tag ensure job.
5. Let Vault policy reconciliation run, then unsuspend `veles-secrets-ensure-2`.
6. Unsuspend `veles-realm-ensure-4` in `services/keycloak` to create the realm/client secret, groups, and roles.
7. Create the Harbor `veles` project or robot access before image automation is enabled in production.
8. Keep backend/frontend scaled to `0` until native OIDC/session support is implemented, image tags exist, and smoke gates pass.
## Assumptions
- `veles-oceanus-artifacts` is RWO for alpha; simulation workers should either run on Oceanus with the backend or stream logs to the backend, which owns writes.
- Longhorn default backup target is `s3://atlas-soteria@us-west-004/` with credential secret `longhorn-backup-b2`; the live `BackupTarget/default` currently reports available. Postgres and artifact volumes have Longhorn recurring snapshot and backup jobs attached by their StorageClasses. This is not a substitute for a tested restore drill.
- The Jenkins job skeleton points at the Veles repo but stays disabled until that repo provides a Jenkinsfile.

View File

@ -1,17 +0,0 @@
# services/veles/artifacts-pvc.yaml
# ReadWriteOnce claim for Veles artifacts; the backend Deployment mounts it
# at /data/veles-artifacts.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: veles-artifacts
  namespace: veles
  labels:
    app.kubernetes.io/name: veles
    app.kubernetes.io/component: artifacts
    veles.bstein.dev/backup: longhorn
spec:
  storageClassName: veles-oceanus-artifacts
  accessModes: ["ReadWriteOnce"]
  resources:
    requests:
      storage: 200Gi

View File

@ -1,107 +0,0 @@
# services/veles/backend-deployment.yaml
# Veles backend API Deployment. An init container runs `veles-db --init`
# before the backend container starts; both share the same image and the
# same ConfigMap/Secret environment.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: veles-backend
  namespace: veles
  labels:
    app: veles-backend
spec:
  replicas: 1
  revisionHistoryLimit: 2
  selector:
    matchLabels:
      app: veles-backend
  template:
    metadata:
      labels:
        app: veles-backend
    spec:
      serviceAccountName: veles-backend
      priorityClassName: veles-core
      # Schedule onto the oceanus node pool and tolerate the
      # veles.bstein.dev/simulation=true:NoSchedule taint.
      nodeSelector:
        veles.bstein.dev/node-pool: oceanus
      tolerations:
        - key: veles.bstein.dev/simulation
          operator: Equal
          value: "true"
          effect: NoSchedule
      securityContext:
        fsGroup: 10001
        fsGroupChangePolicy: OnRootMismatch
        seccompProfile:
          type: RuntimeDefault
      initContainers:
        - name: migrate-db
          image: registry.bstein.dev/veles/veles-backend:0.1.6 # {"$imagepolicy": "veles:veles-backend"}
          imagePullPolicy: IfNotPresent
          command:
            - veles-db
          args:
            - --init
            - --json
          envFrom:
            - configMapRef:
                name: veles-app-config
            - secretRef:
                name: veles-runtime-secrets
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
          securityContext:
            runAsNonRoot: true
            runAsUser: 10001
            runAsGroup: 10001
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
      containers:
        - name: backend
          image: registry.bstein.dev/veles/veles-backend:0.1.6 # {"$imagepolicy": "veles:veles-backend"}
          imagePullPolicy: IfNotPresent
          # Explicit env entry for the simulation worker image; the image tag
          # carries its own image-policy setter marker.
          env:
            - name: VELES_SIM_IMAGE
              value: registry.bstein.dev/veles/veles-sim-worker:0.1.6 # {"$imagepolicy": "veles:veles-sim-worker"}
          envFrom:
            - configMapRef:
                name: veles-app-config
            - secretRef:
                name: veles-runtime-secrets
          ports:
            - name: http
              containerPort: 8796
              protocol: TCP
          readinessProbe:
            httpGet:
              path: /api/v1/ready
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /api/v1/live
              port: http
            initialDelaySeconds: 20
            periodSeconds: 20
          resources:
            requests:
              cpu: 250m
              memory: 512Mi
            limits:
              cpu: "1"
              memory: 2Gi
          securityContext:
            runAsNonRoot: true
            runAsUser: 10001
            runAsGroup: 10001
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
          volumeMounts:
            - name: artifacts
              mountPath: /data/veles-artifacts
      volumes:
        - name: artifacts
          persistentVolumeClaim:
            claimName: veles-artifacts

View File

@ -1,56 +0,0 @@
# services/veles/configmap.yaml
# Non-secret runtime configuration, loaded via envFrom by the backend and
# frontend Deployments. Several settings appear under two key names
# (for example VELES_PUBLIC_URL / VELES_PUBLIC_BASE_URL); both are kept.
apiVersion: v1
kind: ConfigMap
metadata:
  name: veles-app-config
  namespace: veles
data:
  # --- Environment and public endpoints ---
  VELES_ENV: alpha
  VELES_PROFILE: cluster
  VELES_AUTH_MODE: oidc
  VELES_SIM_RUNNER: kubernetes-job
  VELES_PUBLIC_URL: https://veles.bstein.dev
  VELES_PUBLIC_BASE_URL: https://veles.bstein.dev
  VELES_BACKEND_HTTP_PORT: "8796"
  VELES_FRONTEND_HTTP_PORT: "8080"

  # --- OIDC ---
  VELES_OIDC_ISSUER_URL: https://sso.bstein.dev/realms/veles
  VELES_OIDC_ISSUER: https://sso.bstein.dev/realms/veles
  VELES_OIDC_CLIENT_ID: veles-web
  VELES_OIDC_ALLOWED_GROUPS: alpha,admin
  VELES_OIDC_REQUIRED_GROUPS: alpha,admin
  VELES_OIDC_ADMIN_GROUPS: admin
  VELES_OIDC_GROUPS_CLAIM: groups
  VELES_OIDC_ROLES_CLAIM: realm_access.roles

  # --- Database (credentials come from the runtime secret, not from here) ---
  VELES_DATABASE_HOST: veles-postgres.veles.svc.cluster.local
  VELES_DATABASE_PORT: "5432"
  VELES_DATABASE_NAME: veles

  # --- Artifact storage layout ---
  VELES_ARTIFACTS_PATH: /data/veles-artifacts
  VELES_ARTIFACTS_MODE: rwo-backend-owned
  VELES_LOG_ROOT: /data/veles-artifacts/logs
  VELES_REPORT_ROOT: /data/veles-artifacts/reports
  VELES_ARTIFACT_ROOT: /data/veles-artifacts/artifacts
  VELES_RETENTION_DAYS: "30"

  # --- Simulation Job settings ---
  VELES_SIM_NAMESPACE: veles
  VELES_NAMESPACE: veles
  VELES_SIM_IMAGE: registry.bstein.dev/veles/veles-sim-worker:0.1.6 # {"$imagepolicy": "veles:veles-sim-worker"}
  VELES_SIM_SERVICE_ACCOUNT: veles-sim
  VELES_SIM_PRIORITY_CLASS: veles-sim
  VELES_SIM_NODE_SELECTOR: veles.bstein.dev/node-pool=oceanus,kubernetes.io/arch=amd64
  VELES_SIM_TOLERATIONS: veles.bstein.dev/simulation=true:NoSchedule
  VELES_SIM_TOLERATION_KEY: veles.bstein.dev/simulation
  VELES_SIM_TOLERATION_VALUE: "true"
  VELES_SIM_ACTIVE_DEADLINE_SECONDS: "7200"
  VELES_SIM_TTL_SECONDS: "3600"
  VELES_SIM_CPU_REQUEST: 500m
  VELES_SIM_CPU_LIMIT: "2"
  VELES_SIM_MEMORY_REQUEST: 1Gi
  VELES_SIM_MEMORY_LIMIT: 4Gi
  VELES_SIM_ARTIFACT_PVC: veles-artifacts
  VELES_SIM_ARTIFACT_MOUNT_PATH: /data/veles-artifacts
  VELES_SIM_FS_GROUP: "10001"
  VELES_MAX_ACTIVE_SIMS_PER_USER: "1"
  VELES_MAX_ACTIVE_SIMS_GLOBAL: "4"

  # --- Retention worker placement ---
  VELES_RETENTION_NODE_SELECTOR: veles.bstein.dev/simulation=true
  VELES_RETENTION_TOLERATION_KEY: veles.bstein.dev/simulation
  VELES_RETENTION_TOLERATION_VALUE: "true"
  VELES_LOG_RETENTION_DAYS: "30"

View File

@ -1,85 +0,0 @@
# services/veles/frontend-deployment.yaml
# Veles web frontend Deployment: two replicas, no volumes, configured only
# from the veles-app-config ConfigMap.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: veles-frontend
  namespace: veles
  labels:
    app: veles-frontend
spec:
  replicas: 2
  revisionHistoryLimit: 2
  selector:
    matchLabels:
      app: veles-frontend
  template:
    metadata:
      labels:
        app: veles-frontend
    spec:
      serviceAccountName: veles-frontend
      priorityClassName: veles-core
      affinity:
        nodeAffinity:
          # Hard requirement: a worker node whose hardware label is one of
          # rpi5, rpi4, or amd64 (both expressions must match).
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: node-role.kubernetes.io/worker
                    operator: Exists
                  - key: hardware
                    operator: In
                    values:
                      - rpi5
                      - rpi4
                      - amd64
          # Soft preferences: nodes without the spillover label first, then rpi5.
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              preference:
                matchExpressions:
                  - key: atlas.bstein.dev/spillover
                    operator: DoesNotExist
            - weight: 90
              preference:
                matchExpressions:
                  - key: hardware
                    operator: In
                    values:
                      - rpi5
      securityContext:
        fsGroup: 101
        fsGroupChangePolicy: OnRootMismatch
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: frontend
          image: registry.bstein.dev/veles/veles-frontend:0.1.6 # {"$imagepolicy": "veles:veles-frontend"}
          imagePullPolicy: IfNotPresent
          envFrom:
            - configMapRef:
                name: veles-app-config
          ports:
            - name: http
              containerPort: 8080
              protocol: TCP
          readinessProbe:
            httpGet:
              path: /
              port: http
            initialDelaySeconds: 3
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /
              port: http
            initialDelaySeconds: 20
            periodSeconds: 20
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
          securityContext:
            runAsNonRoot: true
            runAsUser: 101
            runAsGroup: 101
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL

View File

@ -1,78 +0,0 @@
# services/veles/image.yaml
# Flux image automation for the three Veles images. Each ImageRepository
# scans the registry every minute using the harbor-regcred pull secret; each
# ImagePolicy keeps only plain MAJOR.MINOR.PATCH tags and selects by semver
# range >=0.1.0.

# --- backend ---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageRepository
metadata:
  name: veles-backend
  namespace: veles
spec:
  image: registry.bstein.dev/veles/veles-backend
  interval: 1m0s
  secretRef:
    name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImagePolicy
metadata:
  name: veles-backend
  namespace: veles
spec:
  imageRepositoryRef:
    name: veles-backend
  filterTags:
    pattern: '^(?P<version>\d+\.\d+\.\d+)$'
    extract: '$version'
  policy:
    semver:
      range: ">=0.1.0"
---
# --- frontend ---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageRepository
metadata:
  name: veles-frontend
  namespace: veles
spec:
  image: registry.bstein.dev/veles/veles-frontend
  interval: 1m0s
  secretRef:
    name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImagePolicy
metadata:
  name: veles-frontend
  namespace: veles
spec:
  imageRepositoryRef:
    name: veles-frontend
  filterTags:
    pattern: '^(?P<version>\d+\.\d+\.\d+)$'
    extract: '$version'
  policy:
    semver:
      range: ">=0.1.0"
---
# --- simulation worker ---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImageRepository
metadata:
  name: veles-sim-worker
  namespace: veles
spec:
  image: registry.bstein.dev/veles/veles-sim-worker
  interval: 1m0s
  secretRef:
    name: harbor-regcred
---
apiVersion: image.toolkit.fluxcd.io/v1
kind: ImagePolicy
metadata:
  name: veles-sim-worker
  namespace: veles
spec:
  imageRepositoryRef:
    name: veles-sim-worker
  filterTags:
    pattern: '^(?P<version>\d+\.\d+\.\d+)$'
    extract: '$version'
  policy:
    semver:
      range: ">=0.1.0"

View File

@ -1,47 +0,0 @@
# services/veles/ingress.yaml
# Traefik Ingress for veles.bstein.dev. /api, /events and /ws go to the
# backend Service; everything else goes to the frontend Service.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: veles
  namespace: veles
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: "true"
spec:
  ingressClassName: traefik
  tls:
    - secretName: veles-tls
      hosts:
        - veles.bstein.dev
  rules:
    - host: veles.bstein.dev
      http:
        paths:
          # Backend-owned prefixes.
          - path: /api
            pathType: Prefix
            backend:
              service:
                name: veles-backend
                port:
                  number: 80
          - path: /events
            pathType: Prefix
            backend:
              service:
                name: veles-backend
                port:
                  number: 80
          - path: /ws
            pathType: Prefix
            backend:
              service:
                name: veles-backend
                port:
                  number: 80
          # Catch-all for the web UI.
          - path: /
            pathType: Prefix
            backend:
              service:
                name: veles-frontend
                port:
                  number: 80

View File

@ -1,22 +0,0 @@
# services/veles/kustomization.yaml
# Kustomize entry point for the Veles stack; every resource is placed in the
# veles namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: veles
resources:
  # Namespace, identities, and secret plumbing
  - namespace.yaml
  - serviceaccounts.yaml
  - secretproviderclass.yaml
  - vault-sync-deployment.yaml
  # Namespace guard rails and shared config
  - resourcequota.yaml
  - limitrange.yaml
  - configmap.yaml
  - rbac.yaml
  # Storage and database
  - artifacts-pvc.yaml
  - postgres-service.yaml
  - postgres-statefulset.yaml
  # Application workloads and exposure
  - services.yaml
  - backend-deployment.yaml
  - frontend-deployment.yaml
  - image.yaml
  - ingress.yaml
  # One-off jobs
  - oneoffs/veles-secrets-ensure-job.yaml

View File

@ -1,21 +0,0 @@
# services/veles/limitrange.yaml
# Per-container defaults and bounds for the veles namespace. Containers that
# omit requests/limits receive the default values below.
apiVersion: v1
kind: LimitRange
metadata:
  name: veles-container-limits
  namespace: veles
spec:
  limits:
    - type: Container
      # Lower and upper bounds accepted for any container.
      min:
        cpu: 10m
        memory: 32Mi
      max:
        cpu: "16"
        memory: 32Gi
      # Values applied when a container does not set its own.
      defaultRequest:
        cpu: 100m
        memory: 256Mi
      default:
        cpu: 500m
        memory: 512Mi

View File

@ -1,8 +0,0 @@
# services/veles/namespace.yaml
# Namespace that holds every Veles resource.
apiVersion: v1
kind: Namespace
metadata:
  name: veles
  labels:
    app.kubernetes.io/part-of: veles
    app.kubernetes.io/name: veles

View File

@ -1,142 +0,0 @@
# services/veles/oneoffs/veles-secrets-ensure-job.yaml
# One-off job for veles/veles-secrets-ensure-2.
# Purpose: seed Veles Vault paths before app/Postgres pods are scaled up.
# Keep suspended until the veles Vault role has reconciled, then unsuspend once.
# Seeds the Veles Vault paths (veles-db, app-secrets, and smtp when the
# shared Postmark relay secret is readable). Existing values are re-read and
# preserved; only missing values are generated.
apiVersion: batch/v1
kind: Job
metadata:
  name: veles-secrets-ensure-2
  namespace: veles
spec:
  # Created suspended; an operator unsuspends it once.
  suspend: true
  backoffLimit: 0
  ttlSecondsAfterFinished: 3600
  template:
    spec:
      serviceAccountName: veles-secrets-ensure
      restartPolicy: Never
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: node-role.kubernetes.io/worker
                    operator: Exists
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              preference:
                matchExpressions:
                  - key: kubernetes.io/arch
                    operator: In
                    values: ["arm64"]
      containers:
        - name: apply
          image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
          command: ["/bin/bash", "-c"]
          args:
            - |
              set -euo pipefail
              vault_addr="${VAULT_ADDR:-http://vault.vault.svc.cluster.local:8200}"
              vault_role="${VAULT_ROLE:-veles-secrets}"

              # Log in to Vault with the pod's service account token.
              jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
              login_payload="$(jq -nc --arg jwt "${jwt}" --arg role "${vault_role}" '{jwt:$jwt, role:$role}')"
              vault_token="$(curl -sS --request POST --data "${login_payload}" \
                "${vault_addr}/v1/auth/kubernetes/login" | jq -r '.auth.client_token')"
              if [ -z "${vault_token}" ] || [ "${vault_token}" = "null" ]; then
                echo "vault login failed" >&2
                exit 1
              fi

              # read_secret PATH OUTFILE: writes the response body to OUTFILE and
              # prints the HTTP status code. Transport errors are tolerated here so
              # the caller can decide based on the printed status.
              read_secret() {
                path="$1"
                out="$2"
                curl -sS -o "${out}" -w "%{http_code}" \
                  -H "X-Vault-Token: ${vault_token}" \
                  "${vault_addr}/v1/kv/data/atlas/${path}" || true
              }

              # write_secret PATH JSON: writes the payload to the KV path and exits
              # the script on any status other than 200/204.
              write_secret() {
                path="$1"
                payload="$2"
                out="$(mktemp)"
                status="$(curl -sS -o "${out}" -w "%{http_code}" -X POST \
                  -H "X-Vault-Token: ${vault_token}" \
                  -H "Content-Type: application/json" \
                  -d "${payload}" \
                  "${vault_addr}/v1/kv/data/atlas/${path}")"
                if [ "${status}" != "200" ] && [ "${status}" != "204" ]; then
                  echo "Vault write failed for ${path} (status ${status})" >&2
                  cat "${out}" >&2 || true
                  exit 1
                fi
                # Do not leave response bodies behind in the temp directory.
                rm -f "${out}"
              }

              # rand_b64 N: N random bytes, base64-encoded on a single line.
              rand_b64() {
                bytes="$1"
                openssl rand -base64 "${bytes}" | tr -d '\n'
              }

              # --- veles/veles-db: keep an existing password, otherwise generate one ---
              status="$(read_secret veles/veles-db /tmp/veles-db.json)"
              if [ "${status}" = "200" ]; then
                db_password="$(jq -r '.data.data.POSTGRES_PASSWORD // empty' /tmp/veles-db.json)"
              elif [ "${status}" = "404" ]; then
                db_password=""
              else
                echo "Vault read failed for veles-db (status ${status})" >&2
                cat /tmp/veles-db.json >&2 || true
                exit 1
              fi
              if [ -z "${db_password}" ]; then
                db_password="$(rand_b64 36)"
              fi
              # The password is base64 and may contain "/" or "+". Inside a
              # connection URI those must be percent-encoded, otherwise a "/" ends
              # the authority part early and the URL cannot be parsed. @uri encodes
              # the userinfo fields; POSTGRES_PASSWORD itself stays raw.
              db_payload="$(jq -nc \
                --arg host "veles-postgres.veles.svc.cluster.local" \
                --arg port "5432" \
                --arg db "veles" \
                --arg user "veles" \
                --arg password "${db_password}" \
                '{data:{POSTGRES_HOST:$host,POSTGRES_PORT:$port,POSTGRES_DB:$db,POSTGRES_USER:$user,POSTGRES_PASSWORD:$password,DATABASE_URL:("postgresql://"+($user|@uri)+":"+($password|@uri)+"@"+$host+":"+$port+"/"+$db+"?sslmode=disable")}}')"
              write_secret veles/veles-db "${db_payload}"

              # --- veles/app-secrets: keep existing keys, generate the missing ones ---
              status="$(read_secret veles/app-secrets /tmp/app-secrets.json)"
              if [ "${status}" = "200" ]; then
                session_secret="$(jq -r '.data.data.VELES_SESSION_SECRET // empty' /tmp/app-secrets.json)"
                byok_key="$(jq -r '.data.data.VELES_BYOK_ENCRYPTION_KEY // empty' /tmp/app-secrets.json)"
              elif [ "${status}" = "404" ]; then
                session_secret=""
                byok_key=""
              else
                echo "Vault read failed for app-secrets (status ${status})" >&2
                cat /tmp/app-secrets.json >&2 || true
                exit 1
              fi
              if [ -z "${session_secret}" ]; then
                session_secret="$(rand_b64 48)"
              fi
              if [ -z "${byok_key}" ]; then
                byok_key="$(rand_b64 32)"
              fi
              app_payload="$(jq -nc \
                --arg session_secret "${session_secret}" \
                --arg byok_key "${byok_key}" \
                '{data:{VELES_SESSION_SECRET:$session_secret,VELES_BYOK_ENCRYPTION_KEY:$byok_key}}')"
              write_secret veles/app-secrets "${app_payload}"

              # --- veles/smtp: best effort, only when the shared relay secret is readable ---
              postmark_status="$(read_secret shared/postmark-relay /tmp/postmark.json)"
              if [ "${postmark_status}" = "200" ]; then
                smtp_password="$(jq -r '.data.data.apikey // empty' /tmp/postmark.json)"
                if [ -n "${smtp_password}" ]; then
                  # The relay API key is written as both SMTP_USER and SMTP_PASSWORD.
                  smtp_payload="$(jq -nc \
                    --arg host "mail.bstein.dev" \
                    --arg port "587" \
                    --arg user "${smtp_password}" \
                    --arg password "${smtp_password}" \
                    --arg from "no-reply-veles@bstein.dev" \
                    --arg from_name "Veles" \
                    '{data:{SMTP_HOST:$host,SMTP_PORT:$port,SMTP_USER:$user,SMTP_PASSWORD:$password,SMTP_FROM:$from,SMTP_FROM_NAME:$from_name,SMTP_STARTTLS:"true"}}')"
                  write_secret veles/smtp "${smtp_payload}"
                fi
              fi
              echo "Veles Vault paths ready: veles-db, app-secrets, smtp when Postmark relay exists"

View File

@ -1,17 +0,0 @@
# services/veles/postgres-service.yaml
# Headless Service (clusterIP: None) in front of the veles-postgres pods;
# it is also the StatefulSet's serviceName.
apiVersion: v1
kind: Service
metadata:
  name: veles-postgres
  namespace: veles
  labels:
    app: veles-postgres
spec:
  clusterIP: None
  selector:
    app: veles-postgres
  ports:
    - name: postgres
      protocol: TCP
      port: 5432
      targetPort: 5432

View File

@ -1,86 +0,0 @@
# services/veles/postgres-statefulset.yaml
# Single-replica Postgres for Veles. The password is rendered to
# /vault/secrets/postgres-password by the Vault agent annotations and read
# through POSTGRES_PASSWORD_FILE. PVCs are retained on delete and scale-down.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: veles-postgres
  namespace: veles
  labels:
    app: veles-postgres
spec:
  serviceName: veles-postgres
  replicas: 1
  selector:
    matchLabels:
      app: veles-postgres
  updateStrategy:
    type: RollingUpdate
  persistentVolumeClaimRetentionPolicy:
    whenDeleted: Retain
    whenScaled: Retain
  template:
    metadata:
      labels:
        app: veles-postgres
      annotations:
        # Vault agent: pre-populate only, using the "veles" role.
        vault.hashicorp.com/agent-inject: "true"
        vault.hashicorp.com/agent-pre-populate-only: "true"
        vault.hashicorp.com/role: "veles"
        vault.hashicorp.com/agent-inject-secret-postgres-password: "kv/data/atlas/veles/veles-db"
        vault.hashicorp.com/agent-inject-template-postgres-password: |
          {{- with secret "kv/data/atlas/veles/veles-db" -}}
          {{ .Data.data.POSTGRES_PASSWORD }}
          {{- end -}}
    spec:
      serviceAccountName: veles-postgres
      priorityClassName: veles-core
      nodeSelector:
        veles.bstein.dev/node-pool: oceanus
      tolerations:
        - key: veles.bstein.dev/simulation
          operator: Equal
          value: "true"
          effect: NoSchedule
      securityContext:
        fsGroup: 999
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: postgres
          image: postgres:15
          ports:
            - name: postgres
              containerPort: 5432
              protocol: TCP
          env:
            # Data lives in a subdirectory of the mounted volume.
            - name: PGDATA
              value: /var/lib/postgresql/data/pgdata
            - name: POSTGRES_USER
              value: veles
            - name: POSTGRES_PASSWORD_FILE
              value: /vault/secrets/postgres-password
            - name: POSTGRES_DB
              value: veles
          resources:
            requests:
              cpu: "2"
              memory: 8Gi
            limits:
              cpu: "4"
              memory: 16Gi
          securityContext:
            allowPrivilegeEscalation: false
          volumeMounts:
            - name: postgres-data
              mountPath: /var/lib/postgresql/data
  volumeClaimTemplates:
    - metadata:
        name: postgres-data
        labels:
          app: veles-postgres
          veles.bstein.dev/backup: longhorn
      spec:
        storageClassName: veles-oceanus-db
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 100Gi

View File

@ -1,36 +0,0 @@
# services/veles/rbac.yaml
# Namespace-scoped permissions for the veles-backend service account:
# manage Jobs, delete/read pods, and read pod logs and events.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: veles-backend-jobs
  namespace: veles
rules:
  # Job lifecycle
  - apiGroups:
      - batch
    resources:
      - jobs
    verbs:
      - create
      - delete
      - deletecollection
      - get
      - list
      - patch
      - watch
  # Pods created by those Jobs
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - delete
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - pods/log
    verbs:
      - get
      - list
      - watch
  # Events, from both the core and events.k8s.io API groups
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - events.k8s.io
    resources:
      - events
    verbs:
      - get
      - list
      - watch
---
# Grants the Role above to the veles-backend service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: veles-backend-jobs
  namespace: veles
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: veles-backend-jobs
subjects:
  - kind: ServiceAccount
    name: veles-backend
    namespace: veles

View File

@ -1,54 +0,0 @@
# services/veles/resourcequota.yaml
# Three quotas for the veles namespace: one namespace-wide cap, plus one per
# priority class (veles-core and veles-sim).

# Namespace-wide ceiling, including Job count and storage.
apiVersion: v1
kind: ResourceQuota
metadata:
  name: veles-namespace-quota
  namespace: veles
spec:
  hard:
    requests.cpu: "12"
    requests.memory: 24Gi
    limits.cpu: "40"
    limits.memory: 96Gi
    pods: "60"
    count/jobs.batch: "100"
    persistentvolumeclaims: "8"
    requests.storage: 300Gi
---
# Applies only to pods with priorityClassName veles-core.
apiVersion: v1
kind: ResourceQuota
metadata:
  name: veles-core-quota
  namespace: veles
spec:
  scopeSelector:
    matchExpressions:
      - scopeName: PriorityClass
        operator: In
        values: ["veles-core"]
  hard:
    requests.cpu: "4"
    requests.memory: 12Gi
    limits.cpu: "8"
    limits.memory: 24Gi
    pods: "12"
---
# Applies only to pods with priorityClassName veles-sim.
apiVersion: v1
kind: ResourceQuota
metadata:
  name: veles-sim-quota
  namespace: veles
spec:
  scopeSelector:
    matchExpressions:
      - scopeName: PriorityClass
        operator: In
        values: ["veles-sim"]
  hard:
    requests.cpu: "8"
    requests.memory: 16Gi
    limits.cpu: "32"
    limits.memory: 72Gi
    pods: "48"

View File

@ -1,54 +0,0 @@
# services/veles/secretproviderclass.yaml
# Secrets Store CSI provider class for Veles. It reads the listed Vault keys
# with the "veles" role and mirrors them into two Kubernetes Secrets:
# harbor-regcred (image pull) and veles-runtime-secrets (backend env).
apiVersion: secrets-store.csi.x-k8s.io/v1
kind: SecretProviderClass
metadata:
  name: veles-vault
  namespace: veles
spec:
  provider: vault
  parameters:
    vaultAddress: "http://vault.vault.svc.cluster.local:8200"
    roleName: "veles"
    # objectName values are referenced by secretObjects below.
    objects: |
      - objectName: "harbor-pull__dockerconfigjson"
        secretPath: "kv/data/atlas/shared/harbor-pull"
        secretKey: "dockerconfigjson"
      - objectName: "veles-db__DATABASE_URL"
        secretPath: "kv/data/atlas/veles/veles-db"
        secretKey: "DATABASE_URL"
      - objectName: "veles-db__POSTGRES_USER"
        secretPath: "kv/data/atlas/veles/veles-db"
        secretKey: "POSTGRES_USER"
      - objectName: "veles-db__POSTGRES_PASSWORD"
        secretPath: "kv/data/atlas/veles/veles-db"
        secretKey: "POSTGRES_PASSWORD"
      - objectName: "veles-oidc__client_secret"
        secretPath: "kv/data/atlas/veles/veles-oidc"
        secretKey: "client_secret"
      - objectName: "veles-app-secrets__VELES_SESSION_SECRET"
        secretPath: "kv/data/atlas/veles/app-secrets"
        secretKey: "VELES_SESSION_SECRET"
      - objectName: "veles-app-secrets__VELES_BYOK_ENCRYPTION_KEY"
        secretPath: "kv/data/atlas/veles/app-secrets"
        secretKey: "VELES_BYOK_ENCRYPTION_KEY"
  secretObjects:
    # Registry pull credentials.
    - secretName: harbor-regcred
      type: kubernetes.io/dockerconfigjson
      data:
        - key: .dockerconfigjson
          objectName: harbor-pull__dockerconfigjson
    # Backend runtime environment.
    - secretName: veles-runtime-secrets
      type: Opaque
      data:
        - key: DATABASE_URL
          objectName: veles-db__DATABASE_URL
        - key: VELES_DATABASE_USER
          objectName: veles-db__POSTGRES_USER
        - key: VELES_DATABASE_PASSWORD
          objectName: veles-db__POSTGRES_PASSWORD
        - key: VELES_OIDC_CLIENT_SECRET
          objectName: veles-oidc__client_secret
        - key: VELES_SESSION_SECRET
          objectName: veles-app-secrets__VELES_SESSION_SECRET
        - key: VELES_BYOK_ENCRYPTION_KEY
          objectName: veles-app-secrets__VELES_BYOK_ENCRYPTION_KEY

View File

@ -1,45 +0,0 @@
# services/veles/serviceaccounts.yaml
# Service accounts for the veles namespace. Accounts that list
# imagePullSecrets use harbor-regcred.

# Workload accounts with registry pull credentials.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: veles-backend
  namespace: veles
imagePullSecrets:
  - name: harbor-regcred
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: veles-frontend
  namespace: veles
imagePullSecrets:
  - name: harbor-regcred
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: veles-postgres
  namespace: veles
imagePullSecrets:
  - name: harbor-regcred
---
# Accounts for the Vault sync Deployment and the secrets-ensure Job.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: veles-vault-sync
  namespace: veles
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: veles-secrets-ensure
  namespace: veles
---
# Simulation account: no API token is mounted into its pods.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: veles-sim
  namespace: veles
automountServiceAccountToken: false
imagePullSecrets:
  - name: harbor-regcred

View File

@ -1,32 +0,0 @@
# services/veles/services.yaml
# ClusterIP Services for the backend and frontend. Both expose port 80 and
# forward to the container port named "http".
apiVersion: v1
kind: Service
metadata:
  name: veles-backend
  namespace: veles
  labels:
    app: veles-backend
spec:
  selector:
    app: veles-backend
  ports:
    - name: http
      protocol: TCP
      port: 80
      targetPort: http
---
apiVersion: v1
kind: Service
metadata:
  name: veles-frontend
  namespace: veles
  labels:
    app: veles-frontend
spec:
  selector:
    app: veles-frontend
  ports:
    - name: http
      protocol: TCP
      port: 80
      targetPort: http

View File

@ -1,43 +0,0 @@
# services/veles/vault-sync-deployment.yaml
# Idle pod whose only job is to mount the veles-vault SecretProviderClass
# through the Secrets Store CSI driver; the container just sleeps.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: veles-vault-sync
  namespace: veles
  labels:
    app: veles-vault-sync
spec:
  replicas: 1
  selector:
    matchLabels:
      app: veles-vault-sync
  template:
    metadata:
      labels:
        app: veles-vault-sync
    spec:
      serviceAccountName: veles-vault-sync
      volumes:
        - name: vault-secrets
          csi:
            driver: secrets-store.csi.k8s.io
            readOnly: true
            volumeAttributes:
              secretProviderClass: veles-vault
      containers:
        - name: sync
          image: alpine:3.20
          command:
            - /bin/sh
            - -c
          args:
            - "sleep infinity"
          volumeMounts:
            - name: vault-secrets
              mountPath: /vault/secrets
              readOnly: true
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              cpu: 50m
              memory: 64Mi