Compare commits
No commits in common. "1a83316a27e4efa66e2dd2fc4c68c2f02c99b10d" and "6d5ecda4d674bc3915335112d1cfd96c2fb270bd" have entirely different histories.
1a83316a27
...
6d5ecda4d6
@ -419,17 +419,16 @@ ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
|
|||||||
"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
|
"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
|
||||||
)
|
)
|
||||||
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
|
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
|
||||||
TEST_REPO_SELECTOR = 'repo=~"ariadne|metis"'
|
ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
|
||||||
TEST_CI_COVERAGE = f'ariadne_ci_coverage_percent{{{TEST_REPO_SELECTOR}}}'
|
ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
|
||||||
TEST_CI_TESTS = f'ariadne_ci_tests_total{{{TEST_REPO_SELECTOR}}}'
|
ARIADNE_TEST_SUCCESS_RATE = (
|
||||||
TEST_SUCCESS_RATE = (
|
|
||||||
"100 * "
|
"100 * "
|
||||||
f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result="passed"}}[30d])) '
|
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) '
|
||||||
"/ clamp_min("
|
"/ clamp_min("
|
||||||
f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"passed|failed|error"}}[30d])), 1)'
|
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)'
|
||||||
)
|
)
|
||||||
TEST_FAILURES_24H = (
|
ARIADNE_TEST_FAILURES_24H = (
|
||||||
f'sum by (result) (max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"failed|error"}}[24h]))'
|
'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
|
||||||
)
|
)
|
||||||
POSTGRES_CONN_USED = (
|
POSTGRES_CONN_USED = (
|
||||||
'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
|
'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
|
||||||
@ -1291,53 +1290,48 @@ def build_overview():
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
test_success = timeseries_panel(
|
panels.append(
|
||||||
42,
|
timeseries_panel(
|
||||||
"Platform Test Success Rate",
|
42,
|
||||||
TEST_SUCCESS_RATE,
|
"Ariadne Test Success Rate",
|
||||||
{"h": 6, "w": 6, "x": 12, "y": 14},
|
ARIADNE_TEST_SUCCESS_RATE,
|
||||||
unit="percent",
|
{"h": 6, "w": 6, "x": 12, "y": 14},
|
||||||
max_value=100,
|
unit="percent",
|
||||||
legend=None,
|
max_value=100,
|
||||||
legend_display="list",
|
legend=None,
|
||||||
|
legend_display="list",
|
||||||
|
)
|
||||||
)
|
)
|
||||||
test_success["description"] = (
|
panels.append(
|
||||||
"Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. "
|
bargauge_panel(
|
||||||
"Add new test series there first so they roll up here."
|
43,
|
||||||
)
|
"Tests with Failures (24h)",
|
||||||
panels.append(test_success)
|
ARIADNE_TEST_FAILURES_24H,
|
||||||
test_failures = bargauge_panel(
|
{"h": 6, "w": 6, "x": 18, "y": 14},
|
||||||
43,
|
unit="none",
|
||||||
"Platform Tests with Failures (24h)",
|
instant=True,
|
||||||
TEST_FAILURES_24H,
|
legend="{{result}}",
|
||||||
{"h": 6, "w": 6, "x": 18, "y": 14},
|
overrides=[
|
||||||
unit="none",
|
{
|
||||||
instant=True,
|
"matcher": {"id": "byName", "options": "error"},
|
||||||
legend="{{result}}",
|
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}],
|
||||||
overrides=[
|
},
|
||||||
{
|
{
|
||||||
"matcher": {"id": "byName", "options": "error"},
|
"matcher": {"id": "byName", "options": "failed"},
|
||||||
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}],
|
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}],
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"matcher": {"id": "byName", "options": "failed"},
|
|
||||||
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
thresholds={
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{"color": "green", "value": None},
|
|
||||||
{"color": "yellow", "value": 1},
|
|
||||||
{"color": "orange", "value": 5},
|
|
||||||
{"color": "red", "value": 10},
|
|
||||||
],
|
],
|
||||||
},
|
thresholds={
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{"color": "green", "value": None},
|
||||||
|
{"color": "yellow", "value": 1},
|
||||||
|
{"color": "orange", "value": 5},
|
||||||
|
{"color": "red", "value": 10},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
)
|
||||||
)
|
)
|
||||||
test_failures["description"] = (
|
|
||||||
"This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
|
|
||||||
)
|
|
||||||
panels.append(test_failures)
|
|
||||||
|
|
||||||
cpu_scope = "$namespace_scope_cpu"
|
cpu_scope = "$namespace_scope_cpu"
|
||||||
gpu_scope = "$namespace_scope_gpu"
|
gpu_scope = "$namespace_scope_gpu"
|
||||||
@ -2655,31 +2649,29 @@ def build_jobs_dashboard():
|
|||||||
legend="{{status}}",
|
legend="{{status}}",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
coverage_panel = stat_panel(
|
panels.append(
|
||||||
17,
|
stat_panel(
|
||||||
"Platform CI Coverage (%)",
|
17,
|
||||||
TEST_CI_COVERAGE,
|
"Ariadne CI Coverage (%)",
|
||||||
{"h": 6, "w": 4, "x": 8, "y": 11},
|
ARIADNE_CI_COVERAGE,
|
||||||
unit="percent",
|
{"h": 6, "w": 4, "x": 8, "y": 11},
|
||||||
decimals=1,
|
unit="percent",
|
||||||
instant=True,
|
decimals=1,
|
||||||
legend="{{branch}}",
|
instant=True,
|
||||||
|
legend="{{branch}}",
|
||||||
|
)
|
||||||
)
|
)
|
||||||
coverage_panel["description"] = "Internal source panel for Atlas Overview automation test rollups."
|
panels.append(
|
||||||
panels.append(coverage_panel)
|
table_panel(
|
||||||
tests_panel = table_panel(
|
18,
|
||||||
18,
|
"Ariadne CI Tests (latest)",
|
||||||
"Platform CI Tests (latest)",
|
ARIADNE_CI_TESTS,
|
||||||
TEST_CI_TESTS,
|
{"h": 6, "w": 12, "x": 12, "y": 11},
|
||||||
{"h": 6, "w": 12, "x": 12, "y": 11},
|
unit="none",
|
||||||
unit="none",
|
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
|
||||||
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],
|
instant=True,
|
||||||
instant=True,
|
)
|
||||||
)
|
)
|
||||||
tests_panel["description"] = (
|
|
||||||
"Atlas Overview test panels depend on these internal repo-tagged CI series."
|
|
||||||
)
|
|
||||||
panels.append(tests_panel)
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"uid": "atlas-jobs",
|
"uid": "atlas-jobs",
|
||||||
|
|||||||
@ -422,7 +422,8 @@ spec:
|
|||||||
- $patch: replace
|
- $patch: replace
|
||||||
- name: VAULT_ENV_FILE
|
- name: VAULT_ENV_FILE
|
||||||
value: /vault/secrets/harbor-jobservice-env.sh
|
value: /vault/secrets/harbor-jobservice-env.sh
|
||||||
envFrom: []
|
envFrom:
|
||||||
|
- $patch: replace
|
||||||
- configMapRef:
|
- configMapRef:
|
||||||
name: harbor-jobservice-env
|
name: harbor-jobservice-env
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
|
|||||||
@ -339,12 +339,6 @@ spec:
|
|||||||
value: "1099511627776"
|
value: "1099511627776"
|
||||||
- name: OPENSEARCH_INDEX_PATTERNS
|
- name: OPENSEARCH_INDEX_PATTERNS
|
||||||
value: kube-*,journald-*,trace-analytics-*
|
value: kube-*,journald-*,trace-analytics-*
|
||||||
- name: METIS_BASE_URL
|
|
||||||
value: http://metis.maintenance.svc.cluster.local
|
|
||||||
- name: METIS_TIMEOUT_SEC
|
|
||||||
value: "15"
|
|
||||||
- name: ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH
|
|
||||||
value: "*/30 * * * *"
|
|
||||||
- name: METRICS_PATH
|
- name: METRICS_PATH
|
||||||
value: "/metrics"
|
value: "/metrics"
|
||||||
resources:
|
resources:
|
||||||
|
|||||||
@ -21,72 +21,3 @@ spec:
|
|||||||
policy:
|
policy:
|
||||||
semver:
|
semver:
|
||||||
range: ">=0.1.0-0"
|
range: ">=0.1.0-0"
|
||||||
---
|
|
||||||
apiVersion: image.toolkit.fluxcd.io/v1beta2
|
|
||||||
kind: ImageRepository
|
|
||||||
metadata:
|
|
||||||
name: metis
|
|
||||||
namespace: maintenance
|
|
||||||
spec:
|
|
||||||
image: registry.bstein.dev/bstein/metis
|
|
||||||
interval: 1m0s
|
|
||||||
secretRef:
|
|
||||||
name: harbor-regcred
|
|
||||||
---
|
|
||||||
apiVersion: image.toolkit.fluxcd.io/v1beta2
|
|
||||||
kind: ImagePolicy
|
|
||||||
metadata:
|
|
||||||
name: metis
|
|
||||||
namespace: maintenance
|
|
||||||
spec:
|
|
||||||
imageRepositoryRef:
|
|
||||||
name: metis
|
|
||||||
policy:
|
|
||||||
semver:
|
|
||||||
range: ">=0.1.0-0"
|
|
||||||
---
|
|
||||||
apiVersion: image.toolkit.fluxcd.io/v1beta2
|
|
||||||
kind: ImageRepository
|
|
||||||
metadata:
|
|
||||||
name: metis-sentinel
|
|
||||||
namespace: maintenance
|
|
||||||
spec:
|
|
||||||
image: registry.bstein.dev/bstein/metis-sentinel
|
|
||||||
interval: 1m0s
|
|
||||||
secretRef:
|
|
||||||
name: harbor-regcred
|
|
||||||
---
|
|
||||||
apiVersion: image.toolkit.fluxcd.io/v1beta2
|
|
||||||
kind: ImagePolicy
|
|
||||||
metadata:
|
|
||||||
name: metis-sentinel
|
|
||||||
namespace: maintenance
|
|
||||||
spec:
|
|
||||||
imageRepositoryRef:
|
|
||||||
name: metis-sentinel
|
|
||||||
policy:
|
|
||||||
semver:
|
|
||||||
range: ">=0.1.0-0"
|
|
||||||
---
|
|
||||||
apiVersion: image.toolkit.fluxcd.io/v1beta2
|
|
||||||
kind: ImageRepository
|
|
||||||
metadata:
|
|
||||||
name: soteria
|
|
||||||
namespace: maintenance
|
|
||||||
spec:
|
|
||||||
image: registry.bstein.dev/bstein/soteria
|
|
||||||
interval: 1m0s
|
|
||||||
secretRef:
|
|
||||||
name: harbor-regcred
|
|
||||||
---
|
|
||||||
apiVersion: image.toolkit.fluxcd.io/v1beta2
|
|
||||||
kind: ImagePolicy
|
|
||||||
metadata:
|
|
||||||
name: soteria
|
|
||||||
namespace: maintenance
|
|
||||||
spec:
|
|
||||||
imageRepositoryRef:
|
|
||||||
name: soteria
|
|
||||||
policy:
|
|
||||||
semver:
|
|
||||||
range: ">=0.1.0-0"
|
|
||||||
|
|||||||
@ -5,43 +5,28 @@ resources:
|
|||||||
- namespace.yaml
|
- namespace.yaml
|
||||||
- image.yaml
|
- image.yaml
|
||||||
- secretproviderclass.yaml
|
- secretproviderclass.yaml
|
||||||
- metis-configmap.yaml
|
|
||||||
- metis-data-pvc.yaml
|
|
||||||
- vault-serviceaccount.yaml
|
- vault-serviceaccount.yaml
|
||||||
- vault-sync-deployment.yaml
|
- vault-sync-deployment.yaml
|
||||||
- ariadne-serviceaccount.yaml
|
- ariadne-serviceaccount.yaml
|
||||||
- ariadne-rbac.yaml
|
- ariadne-rbac.yaml
|
||||||
- disable-k3s-traefik-serviceaccount.yaml
|
- disable-k3s-traefik-serviceaccount.yaml
|
||||||
- k3s-traefik-cleanup-rbac.yaml
|
- k3s-traefik-cleanup-rbac.yaml
|
||||||
- metis-serviceaccount.yaml
|
|
||||||
- metis-rbac.yaml
|
|
||||||
- metis-token-sync-serviceaccount.yaml
|
|
||||||
- metis-token-sync-rbac.yaml
|
|
||||||
- node-nofile-serviceaccount.yaml
|
- node-nofile-serviceaccount.yaml
|
||||||
- pod-cleaner-rbac.yaml
|
- pod-cleaner-rbac.yaml
|
||||||
- ariadne-deployment.yaml
|
- ariadne-deployment.yaml
|
||||||
- metis-deployment.yaml
|
|
||||||
- oneoffs/ariadne-migrate-job.yaml
|
- oneoffs/ariadne-migrate-job.yaml
|
||||||
- ariadne-service.yaml
|
- ariadne-service.yaml
|
||||||
- disable-k3s-traefik-daemonset.yaml
|
- disable-k3s-traefik-daemonset.yaml
|
||||||
- oneoffs/k3s-traefik-cleanup-job.yaml
|
- oneoffs/k3s-traefik-cleanup-job.yaml
|
||||||
- node-nofile-daemonset.yaml
|
- node-nofile-daemonset.yaml
|
||||||
- metis-sentinel-daemonset.yaml
|
|
||||||
- metis-k3s-token-sync-cronjob.yaml
|
|
||||||
- k3s-agent-restart-daemonset.yaml
|
- k3s-agent-restart-daemonset.yaml
|
||||||
- pod-cleaner-cronjob.yaml
|
- pod-cleaner-cronjob.yaml
|
||||||
- node-image-sweeper-serviceaccount.yaml
|
- node-image-sweeper-serviceaccount.yaml
|
||||||
- node-image-sweeper-daemonset.yaml
|
- node-image-sweeper-daemonset.yaml
|
||||||
- image-sweeper-cronjob.yaml
|
- image-sweeper-cronjob.yaml
|
||||||
- metis-service.yaml
|
|
||||||
- metis-ingress.yaml
|
|
||||||
images:
|
images:
|
||||||
- name: registry.bstein.dev/bstein/ariadne
|
- name: registry.bstein.dev/bstein/ariadne
|
||||||
newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
|
newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
|
||||||
- name: registry.bstein.dev/bstein/metis
|
|
||||||
newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:metis:tag"}
|
|
||||||
- name: registry.bstein.dev/bstein/metis-sentinel
|
|
||||||
newTag: 0.1.0-0 # {"$imagepolicy": "maintenance:metis-sentinel:tag"}
|
|
||||||
configMapGenerator:
|
configMapGenerator:
|
||||||
- name: disable-k3s-traefik-script
|
- name: disable-k3s-traefik-script
|
||||||
namespace: maintenance
|
namespace: maintenance
|
||||||
|
|||||||
@ -1,20 +0,0 @@
|
|||||||
# services/maintenance/metis-configmap.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: metis
|
|
||||||
namespace: maintenance
|
|
||||||
data:
|
|
||||||
METIS_BIND_ADDR: :8080
|
|
||||||
METIS_INVENTORY_PATH: /app/inventory.titan-rpi4.yaml
|
|
||||||
METIS_DATA_DIR: /var/lib/metis
|
|
||||||
METIS_DEFAULT_FLASH_HOST: titan-22
|
|
||||||
METIS_FLASH_HOSTS: titan-22
|
|
||||||
METIS_LOCAL_HOST: titan-22
|
|
||||||
METIS_ALLOWED_GROUPS: admin,maintainer
|
|
||||||
METIS_MAX_DEVICE_BYTES: "300000000000"
|
|
||||||
METIS_SENTINEL_PUSH_URL: http://metis.maintenance.svc.cluster.local/internal/sentinel/snapshot
|
|
||||||
METIS_SENTINEL_INTERVAL_SEC: "1800"
|
|
||||||
METIS_SENTINEL_NSENTER: "1"
|
|
||||||
METIS_IMAGE_RPI4_ARMBIAN_LONGHORN: https://armbian.chi.auroradev.org/dl/rpi4b/archive/Armbian_26.2.1_Rpi4b_noble_current_6.18.9_minimal.img.xz
|
|
||||||
METIS_IMAGE_RPI4_ARMBIAN_LONGHORN_SHA256: sha256:c450687adf4cc6a59725c43aefd58baf42ec71bdd379227d403cdde281768e46
|
|
||||||
@ -1,13 +0,0 @@
|
|||||||
# services/maintenance/metis-data-pvc.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: PersistentVolumeClaim
|
|
||||||
metadata:
|
|
||||||
name: metis-data
|
|
||||||
namespace: maintenance
|
|
||||||
spec:
|
|
||||||
accessModes:
|
|
||||||
- ReadWriteOnce
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: 40Gi
|
|
||||||
storageClassName: local-path
|
|
||||||
@ -1,47 +0,0 @@
|
|||||||
# services/maintenance/metis-deployment.yaml
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: metis
|
|
||||||
namespace: maintenance
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
revisionHistoryLimit: 3
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: metis
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: metis
|
|
||||||
annotations:
|
|
||||||
prometheus.io/scrape: "true"
|
|
||||||
prometheus.io/port: "8080"
|
|
||||||
prometheus.io/path: "/metrics"
|
|
||||||
spec:
|
|
||||||
serviceAccountName: metis
|
|
||||||
nodeSelector:
|
|
||||||
kubernetes.io/hostname: titan-22
|
|
||||||
kubernetes.io/arch: amd64
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
containers:
|
|
||||||
- name: metis
|
|
||||||
image: registry.bstein.dev/bstein/metis:latest
|
|
||||||
imagePullPolicy: Always
|
|
||||||
envFrom:
|
|
||||||
- configMapRef:
|
|
||||||
name: metis
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
containerPort: 8080
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 100m
|
|
||||||
memory: 128Mi
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 512Mi
|
|
||||||
securityContext:
|
|
||||||
allowPrivilegeEscalation: false
|
|
||||||
capabilities:
|
|
||||||
drop: ["ALL"]
|
|
||||||
@ -1,27 +0,0 @@
|
|||||||
# services/maintenance/metis-ingress.yaml
|
|
||||||
apiVersion: networking.k8s.io/v1
|
|
||||||
kind: Ingress
|
|
||||||
metadata:
|
|
||||||
name: metis
|
|
||||||
namespace: maintenance
|
|
||||||
annotations:
|
|
||||||
kubernetes.io/ingress.class: traefik
|
|
||||||
cert-manager.io/cluster-issuer: letsencrypt
|
|
||||||
traefik.ingress.kubernetes.io/router.entrypoints: websecure
|
|
||||||
traefik.ingress.kubernetes.io/router.tls: "true"
|
|
||||||
traefik.ingress.kubernetes.io/router.middlewares: sso-oauth2-proxy-forward-auth@kubernetescrd
|
|
||||||
spec:
|
|
||||||
tls:
|
|
||||||
- hosts: ["metis.bstein.dev"]
|
|
||||||
secretName: metis-tls
|
|
||||||
rules:
|
|
||||||
- host: metis.bstein.dev
|
|
||||||
http:
|
|
||||||
paths:
|
|
||||||
- path: /
|
|
||||||
pathType: Prefix
|
|
||||||
backend:
|
|
||||||
service:
|
|
||||||
name: metis
|
|
||||||
port:
|
|
||||||
number: 80
|
|
||||||
@ -1,51 +0,0 @@
|
|||||||
# services/maintenance/metis-k3s-token-sync-cronjob.yaml
|
|
||||||
apiVersion: batch/v1
|
|
||||||
kind: CronJob
|
|
||||||
metadata:
|
|
||||||
name: metis-k3s-token-sync
|
|
||||||
namespace: maintenance
|
|
||||||
spec:
|
|
||||||
schedule: "11 */6 * * *"
|
|
||||||
concurrencyPolicy: Forbid
|
|
||||||
successfulJobsHistoryLimit: 1
|
|
||||||
failedJobsHistoryLimit: 2
|
|
||||||
jobTemplate:
|
|
||||||
spec:
|
|
||||||
template:
|
|
||||||
spec:
|
|
||||||
serviceAccountName: metis-token-sync
|
|
||||||
restartPolicy: OnFailure
|
|
||||||
nodeSelector:
|
|
||||||
kubernetes.io/arch: arm64
|
|
||||||
node-role.kubernetes.io/control-plane: "true"
|
|
||||||
tolerations:
|
|
||||||
- key: node-role.kubernetes.io/control-plane
|
|
||||||
operator: Exists
|
|
||||||
effect: NoSchedule
|
|
||||||
- key: node-role.kubernetes.io/master
|
|
||||||
operator: Exists
|
|
||||||
effect: NoSchedule
|
|
||||||
containers:
|
|
||||||
- name: sync
|
|
||||||
image: registry.bstein.dev/bstein/kubectl:1.35.0
|
|
||||||
imagePullPolicy: IfNotPresent
|
|
||||||
command:
|
|
||||||
- /bin/sh
|
|
||||||
- -c
|
|
||||||
args:
|
|
||||||
- |
|
|
||||||
set -euo pipefail
|
|
||||||
token="$(tr -d '\n' < /host/var/lib/rancher/k3s/server/node-token)"
|
|
||||||
kubectl -n maintenance create secret generic metis-runtime \
|
|
||||||
--from-literal=k3s_token="${token}" \
|
|
||||||
--dry-run=client -o yaml | kubectl apply -f -
|
|
||||||
securityContext:
|
|
||||||
runAsUser: 0
|
|
||||||
volumeMounts:
|
|
||||||
- name: k3s-server
|
|
||||||
mountPath: /host/var/lib/rancher/k3s/server
|
|
||||||
readOnly: true
|
|
||||||
volumes:
|
|
||||||
- name: k3s-server
|
|
||||||
hostPath:
|
|
||||||
path: /var/lib/rancher/k3s/server
|
|
||||||
@ -1,27 +0,0 @@
|
|||||||
# services/maintenance/metis-rbac.yaml
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: ClusterRole
|
|
||||||
metadata:
|
|
||||||
name: metis-node-manager
|
|
||||||
rules:
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources:
|
|
||||||
- nodes
|
|
||||||
verbs:
|
|
||||||
- get
|
|
||||||
- list
|
|
||||||
- watch
|
|
||||||
- delete
|
|
||||||
---
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: ClusterRoleBinding
|
|
||||||
metadata:
|
|
||||||
name: metis-node-manager
|
|
||||||
subjects:
|
|
||||||
- kind: ServiceAccount
|
|
||||||
name: metis
|
|
||||||
namespace: maintenance
|
|
||||||
roleRef:
|
|
||||||
apiGroup: rbac.authorization.k8s.io
|
|
||||||
kind: ClusterRole
|
|
||||||
name: metis-node-manager
|
|
||||||
@ -1,133 +0,0 @@
|
|||||||
# services/maintenance/metis-sentinel-daemonset.yaml
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: DaemonSet
|
|
||||||
metadata:
|
|
||||||
name: metis-sentinel
|
|
||||||
namespace: maintenance
|
|
||||||
spec:
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: metis-sentinel
|
|
||||||
updateStrategy:
|
|
||||||
type: RollingUpdate
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app: metis-sentinel
|
|
||||||
annotations:
|
|
||||||
prometheus.io/scrape: "true"
|
|
||||||
prometheus.io/port: "8080"
|
|
||||||
prometheus.io/path: "/metrics"
|
|
||||||
spec:
|
|
||||||
serviceAccountName: metis
|
|
||||||
nodeSelector:
|
|
||||||
kubernetes.io/os: linux
|
|
||||||
node-role.kubernetes.io/worker: "true"
|
|
||||||
containers:
|
|
||||||
- name: metis-sentinel
|
|
||||||
image: registry.bstein.dev/bstein/metis-sentinel:latest
|
|
||||||
imagePullPolicy: Always
|
|
||||||
command:
|
|
||||||
- /bin/sh
|
|
||||||
- -c
|
|
||||||
args:
|
|
||||||
- |
|
|
||||||
set -eu
|
|
||||||
out_dir="${METIS_SENTINEL_OUT:-/var/run/metis-sentinel}"
|
|
||||||
interval="${METIS_SENTINEL_INTERVAL_SEC:-120}"
|
|
||||||
mkdir -p "${out_dir}"
|
|
||||||
while true; do
|
|
||||||
ts="$(date -u +%Y%m%dT%H%M%SZ)"
|
|
||||||
node="${METIS_SENTINEL_NODE:-unknown}"
|
|
||||||
tmp="${out_dir}/${node}-${ts}.json.tmp"
|
|
||||||
out="${out_dir}/${node}-${ts}.json"
|
|
||||||
if metis-sentinel > "${tmp}"; then
|
|
||||||
mv "${tmp}" "${out}"
|
|
||||||
else
|
|
||||||
rm -f "${tmp}" || true
|
|
||||||
fi
|
|
||||||
sleep "${interval}"
|
|
||||||
done
|
|
||||||
envFrom:
|
|
||||||
- configMapRef:
|
|
||||||
name: metis
|
|
||||||
env:
|
|
||||||
- name: METIS_SENTINEL_NODE
|
|
||||||
valueFrom:
|
|
||||||
fieldRef:
|
|
||||||
fieldPath: spec.nodeName
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
containerPort: 8080
|
|
||||||
volumeMounts:
|
|
||||||
- name: sentinel-output
|
|
||||||
mountPath: /var/run/metis-sentinel
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 25m
|
|
||||||
memory: 64Mi
|
|
||||||
limits:
|
|
||||||
cpu: 250m
|
|
||||||
memory: 256Mi
|
|
||||||
securityContext:
|
|
||||||
allowPrivilegeEscalation: false
|
|
||||||
runAsUser: 0
|
|
||||||
capabilities:
|
|
||||||
drop: ["ALL"]
|
|
||||||
- name: sentinel-pusher
|
|
||||||
image: curlimages/curl:8.12.1
|
|
||||||
imagePullPolicy: IfNotPresent
|
|
||||||
command:
|
|
||||||
- /bin/sh
|
|
||||||
- -c
|
|
||||||
args:
|
|
||||||
- |
|
|
||||||
set -eu
|
|
||||||
out_dir="${METIS_SENTINEL_OUT:-/var/run/metis-sentinel}"
|
|
||||||
push_url="${METIS_SENTINEL_PUSH_URL:-}"
|
|
||||||
interval="${METIS_SENTINEL_PUSH_INTERVAL_SEC:-120}"
|
|
||||||
timeout="${METIS_SENTINEL_PUSH_TIMEOUT_SEC:-10}"
|
|
||||||
mkdir -p "${out_dir}"
|
|
||||||
while true; do
|
|
||||||
for snapshot in "${out_dir}"/*.json; do
|
|
||||||
[ -f "${snapshot}" ] || continue
|
|
||||||
if [ -z "${push_url}" ]; then
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
if curl -fsS --connect-timeout "${timeout}" --max-time "${timeout}" \
|
|
||||||
-X POST \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-H "X-Metis-Node: ${METIS_SENTINEL_NODE:-unknown}" \
|
|
||||||
--data-binary "@${snapshot}" \
|
|
||||||
"${push_url}"; then
|
|
||||||
rm -f "${snapshot}"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
sleep "${interval}"
|
|
||||||
done
|
|
||||||
envFrom:
|
|
||||||
- configMapRef:
|
|
||||||
name: metis
|
|
||||||
env:
|
|
||||||
- name: METIS_SENTINEL_NODE
|
|
||||||
valueFrom:
|
|
||||||
fieldRef:
|
|
||||||
fieldPath: spec.nodeName
|
|
||||||
volumeMounts:
|
|
||||||
- name: sentinel-output
|
|
||||||
mountPath: /var/run/metis-sentinel
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 10m
|
|
||||||
memory: 32Mi
|
|
||||||
limits:
|
|
||||||
cpu: 100m
|
|
||||||
memory: 128Mi
|
|
||||||
securityContext:
|
|
||||||
allowPrivilegeEscalation: false
|
|
||||||
runAsUser: 0
|
|
||||||
capabilities:
|
|
||||||
drop: ["ALL"]
|
|
||||||
volumes:
|
|
||||||
- name: sentinel-output
|
|
||||||
emptyDir: {}
|
|
||||||
@ -1,18 +0,0 @@
|
|||||||
# services/maintenance/metis-service.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: metis
|
|
||||||
namespace: maintenance
|
|
||||||
annotations:
|
|
||||||
prometheus.io/scrape: "true"
|
|
||||||
prometheus.io/port: "80"
|
|
||||||
prometheus.io/path: "/metrics"
|
|
||||||
spec:
|
|
||||||
type: ClusterIP
|
|
||||||
selector:
|
|
||||||
app: metis
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
port: 80
|
|
||||||
targetPort: http
|
|
||||||
@ -1,6 +0,0 @@
|
|||||||
# services/maintenance/metis-serviceaccount.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ServiceAccount
|
|
||||||
metadata:
|
|
||||||
name: metis
|
|
||||||
namespace: maintenance
|
|
||||||
@ -1,30 +0,0 @@
|
|||||||
# services/maintenance/metis-token-sync-rbac.yaml
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: Role
|
|
||||||
metadata:
|
|
||||||
name: metis-token-sync
|
|
||||||
namespace: maintenance
|
|
||||||
rules:
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources:
|
|
||||||
- secrets
|
|
||||||
verbs:
|
|
||||||
- get
|
|
||||||
- list
|
|
||||||
- create
|
|
||||||
- update
|
|
||||||
- patch
|
|
||||||
---
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: RoleBinding
|
|
||||||
metadata:
|
|
||||||
name: metis-token-sync
|
|
||||||
namespace: maintenance
|
|
||||||
subjects:
|
|
||||||
- kind: ServiceAccount
|
|
||||||
name: metis-token-sync
|
|
||||||
namespace: maintenance
|
|
||||||
roleRef:
|
|
||||||
apiGroup: rbac.authorization.k8s.io
|
|
||||||
kind: Role
|
|
||||||
name: metis-token-sync
|
|
||||||
@ -1,6 +0,0 @@
|
|||||||
# services/maintenance/metis-token-sync-serviceaccount.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ServiceAccount
|
|
||||||
metadata:
|
|
||||||
name: metis-token-sync
|
|
||||||
namespace: maintenance
|
|
||||||
@ -1125,7 +1125,7 @@
|
|||||||
{
|
{
|
||||||
"id": 17,
|
"id": 17,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Platform CI Coverage (%)",
|
"title": "Ariadne CI Coverage (%)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1138,7 +1138,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
|
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{branch}}",
|
"legendFormat": "{{branch}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -1183,13 +1183,12 @@
|
|||||||
"values": false
|
"values": false
|
||||||
},
|
},
|
||||||
"textMode": "value"
|
"textMode": "value"
|
||||||
},
|
}
|
||||||
"description": "Internal source panel for Atlas Overview automation test rollups."
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 18,
|
"id": 18,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Platform CI Tests (latest)",
|
"title": "Ariadne CI Tests (latest)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1202,7 +1201,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
|
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -1234,8 +1233,7 @@
|
|||||||
"order": "desc"
|
"order": "desc"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
]
|
||||||
"description": "Atlas Overview test panels depend on these internal repo-tagged CI series."
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
@ -1677,7 +1677,7 @@
|
|||||||
{
|
{
|
||||||
"id": 42,
|
"id": 42,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Platform Test Success Rate",
|
"title": "Ariadne Test Success Rate",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1690,7 +1690,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
|
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -1709,13 +1709,12 @@
|
|||||||
"tooltip": {
|
"tooltip": {
|
||||||
"mode": "multi"
|
"mode": "multi"
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
"description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. Add new test series there first so they roll up here."
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 43,
|
"id": 43,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Platform Tests with Failures (24h)",
|
"title": "Tests with Failures (24h)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1728,7 +1727,7 @@
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
|
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{result}}",
|
"legendFormat": "{{result}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -1815,8 +1814,7 @@
|
|||||||
"order": "desc"
|
"order": "desc"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
]
|
||||||
"description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 11,
|
"id": 11,
|
||||||
|
|||||||
@ -1134,7 +1134,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 17,
|
"id": 17,
|
||||||
"type": "stat",
|
"type": "stat",
|
||||||
"title": "Platform CI Coverage (%)",
|
"title": "Ariadne CI Coverage (%)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1147,7 +1147,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
|
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{branch}}",
|
"legendFormat": "{{branch}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -1192,13 +1192,12 @@ data:
|
|||||||
"values": false
|
"values": false
|
||||||
},
|
},
|
||||||
"textMode": "value"
|
"textMode": "value"
|
||||||
},
|
}
|
||||||
"description": "Internal source panel for Atlas Overview automation test rollups."
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 18,
|
"id": 18,
|
||||||
"type": "table",
|
"type": "table",
|
||||||
"title": "Platform CI Tests (latest)",
|
"title": "Ariadne CI Tests (latest)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1211,7 +1210,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
|
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"instant": true
|
"instant": true
|
||||||
}
|
}
|
||||||
@ -1243,8 +1242,7 @@ data:
|
|||||||
"order": "desc"
|
"order": "desc"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
]
|
||||||
"description": "Atlas Overview test panels depend on these internal repo-tagged CI series."
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"time": {
|
"time": {
|
||||||
|
|||||||
@ -1686,7 +1686,7 @@ data:
|
|||||||
{
|
{
|
||||||
"id": 42,
|
"id": 42,
|
||||||
"type": "timeseries",
|
"type": "timeseries",
|
||||||
"title": "Platform Test Success Rate",
|
"title": "Ariadne Test Success Rate",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1699,7 +1699,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
|
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -1718,13 +1718,12 @@ data:
|
|||||||
"tooltip": {
|
"tooltip": {
|
||||||
"mode": "multi"
|
"mode": "multi"
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
"description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. Add new test series there first so they roll up here."
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 43,
|
"id": 43,
|
||||||
"type": "bargauge",
|
"type": "bargauge",
|
||||||
"title": "Platform Tests with Failures (24h)",
|
"title": "Tests with Failures (24h)",
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
"uid": "atlas-vm"
|
"uid": "atlas-vm"
|
||||||
@ -1737,7 +1736,7 @@ data:
|
|||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
|
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"legendFormat": "{{result}}",
|
"legendFormat": "{{result}}",
|
||||||
"instant": true
|
"instant": true
|
||||||
@ -1824,8 +1823,7 @@ data:
|
|||||||
"order": "desc"
|
"order": "desc"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
]
|
||||||
"description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 11,
|
"id": 11,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user