diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 5b4c96df..3be13f10 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -1295,48 +1295,53 @@ def build_overview(): }, } ) - panels.append( - timeseries_panel( - 42, - "Ariadne + Metis Test Success Rate", - TEST_SUCCESS_RATE, - {"h": 6, "w": 6, "x": 12, "y": 14}, - unit="percent", - max_value=100, - legend=None, - legend_display="list", - ) + test_success = timeseries_panel( + 42, + "Platform Test Success Rate", + TEST_SUCCESS_RATE, + {"h": 6, "w": 6, "x": 12, "y": 14}, + unit="percent", + max_value=100, + legend=None, + legend_display="list", ) - panels.append( - bargauge_panel( - 43, - "Ariadne + Metis Tests with Failures (24h)", - TEST_FAILURES_24H, - {"h": 6, "w": 6, "x": 18, "y": 14}, - unit="none", - instant=True, - legend="{{result}}", - overrides=[ - { - "matcher": {"id": "byName", "options": "error"}, - "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}], - }, - { - "matcher": {"id": "byName", "options": "failed"}, - "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}], - }, - ], - thresholds={ - "mode": "absolute", - "steps": [ - {"color": "green", "value": None}, - {"color": "yellow", "value": 1}, - {"color": "orange", "value": 5}, - {"color": "red", "value": 10}, - ], + test_success["description"] = ( + "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. " + "Add new test series there first so they roll up here." + ) + panels.append(test_success) + test_failures = bargauge_panel( + 43, + "Platform Tests with Failures (24h)", + TEST_FAILURES_24H, + {"h": 6, "w": 6, "x": 18, "y": 14}, + unit="none", + instant=True, + legend="{{result}}", + overrides=[ + { + "matcher": {"id": "byName", "options": "error"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}], }, - ) + { + "matcher": {"id": "byName", "options": "failed"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}], + }, + ], + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "orange", "value": 5}, + {"color": "red", "value": 10}, + ], + }, ) + test_failures["description"] = ( + "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query." + ) + panels.append(test_failures) cpu_scope = "$namespace_scope_cpu" gpu_scope = "$namespace_scope_gpu" @@ -2654,29 +2659,31 @@ def build_jobs_dashboard(): legend="{{status}}", ) ) - panels.append( - stat_panel( - 17, - "Ariadne + Metis CI Coverage (%)", - TEST_CI_COVERAGE, - {"h": 6, "w": 4, "x": 8, "y": 11}, - unit="percent", - decimals=1, - instant=True, - legend="{{branch}}", - ) + coverage_panel = stat_panel( + 17, + "Platform CI Coverage (%)", + TEST_CI_COVERAGE, + {"h": 6, "w": 4, "x": 8, "y": 11}, + unit="percent", + decimals=1, + instant=True, + legend="{{branch}}", ) - panels.append( - table_panel( - 18, - "Ariadne + Metis CI Tests (latest)", - TEST_CI_TESTS, - {"h": 6, "w": 12, "x": 12, "y": 11}, - unit="none", - transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], - instant=True, - ) + coverage_panel["description"] = "Internal source panel for Atlas Overview automation test rollups." + panels.append(coverage_panel) + tests_panel = table_panel( + 18, + "Platform CI Tests (latest)", + TEST_CI_TESTS, + {"h": 6, "w": 12, "x": 12, "y": 11}, + unit="none", + transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], + instant=True, ) + tests_panel["description"] = ( + "Atlas Overview test panels depend on these internal repo-tagged CI series." + ) + panels.append(tests_panel) return { "uid": "atlas-jobs", diff --git a/services/harbor/helmrelease.yaml b/services/harbor/helmrelease.yaml index 2b51f40a..40ac51f0 100644 --- a/services/harbor/helmrelease.yaml +++ b/services/harbor/helmrelease.yaml @@ -437,8 +437,7 @@ spec: - $patch: replace - name: VAULT_ENV_FILE value: /vault/secrets/harbor-jobservice-env.sh - envFrom: - - $patch: replace + envFrom: [] - configMapRef: name: harbor-jobservice-env volumeMounts: diff --git a/services/jenkins/configmap-jcasc.yaml b/services/jenkins/configmap-jcasc.yaml index d0efa4ba..65bafbff 100644 --- a/services/jenkins/configmap-jcasc.yaml +++ b/services/jenkins/configmap-jcasc.yaml @@ -193,6 +193,32 @@ data: } } } + pipelineJob('metis') { + properties { + pipelineTriggers { + triggers { + scmTrigger { + scmpoll_spec('H/5 * * * *') + ignorePostCommitHooks(false) + } + } + } + } + definition { + cpsScm { + scm { + git { + remote { + url('https://scm.bstein.dev/bstein/metis.git') + credentials('gitea-pat') + } + branches('*/master') + } + } + scriptPath('Jenkinsfile') + } + } + } pipelineJob('atlasbot') { properties { pipelineTriggers { diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index a0620b06..390e5b36 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -302,11 +302,11 @@ spec: - name: ARIADNE_SCHEDULE_FIREFLY_CRON value: "0 3 * * *" - name: ARIADNE_SCHEDULE_POD_CLEANER - value: "0 * * * *" + value: "*/30 * * * *" - name: ARIADNE_SCHEDULE_OPENSEARCH_PRUNE value: "23 3 * * *" - name: ARIADNE_SCHEDULE_IMAGE_SWEEPER - value: "30 4 * * *" + value: "0 */4 * * *" - name: ARIADNE_SCHEDULE_VAULT_K8S_AUTH value: "*/15 * * * *" - name: ARIADNE_SCHEDULE_VAULT_OIDC @@ -320,9 +320,9 @@ spec: - name: ARIADNE_SCHEDULE_COMMS_SEED_ROOM value: "*/10 * * * *" - name: ARIADNE_SCHEDULE_CLUSTER_STATE - value: "*/15 * * * *" + value: "*/10 * * * *" - name: ARIADNE_CLUSTER_STATE_KEEP - value: "168" + value: "720" - name: WELCOME_EMAIL_ENABLED value: "true" - name: K8S_API_TIMEOUT_SEC @@ -339,6 +339,12 @@ spec: value: "1099511627776" - name: OPENSEARCH_INDEX_PATTERNS value: kube-*,journald-*,trace-analytics-* + - name: METIS_BASE_URL + value: http://metis.maintenance.svc.cluster.local + - name: METIS_TIMEOUT_SEC + value: "15" + - name: ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH + value: "*/30 * * * *" - name: METRICS_PATH value: "/metrics" resources: diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index 0280e6f3..8d3af4b2 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -7,6 +7,7 @@ resources: - secretproviderclass.yaml - soteria-configmap.yaml - metis-configmap.yaml + - metis-data-pvc.yaml - vault-serviceaccount.yaml - vault-sync-deployment.yaml - ariadne-serviceaccount.yaml @@ -14,6 +15,9 @@ resources: - disable-k3s-traefik-serviceaccount.yaml - k3s-traefik-cleanup-rbac.yaml - metis-serviceaccount.yaml + - metis-rbac.yaml + - metis-token-sync-serviceaccount.yaml + - metis-token-sync-rbac.yaml - node-nofile-serviceaccount.yaml - pod-cleaner-rbac.yaml - soteria-serviceaccount.yaml @@ -27,6 +31,7 @@ resources: - oneoffs/k3s-traefik-cleanup-job.yaml - node-nofile-daemonset.yaml - metis-sentinel-daemonset.yaml + - metis-k3s-token-sync-cronjob.yaml - k3s-agent-restart-daemonset.yaml - pod-cleaner-cronjob.yaml - node-image-sweeper-serviceaccount.yaml diff --git a/services/maintenance/metis-configmap.yaml b/services/maintenance/metis-configmap.yaml index 8cc5928a..5e6af487 100644 --- a/services/maintenance/metis-configmap.yaml +++ b/services/maintenance/metis-configmap.yaml @@ -5,12 +5,16 @@ metadata: name: metis namespace: maintenance data: - METIS_DEFAULT_FLASH_NODE: titan-22 - METIS_UI_BASE_URL: https://metis.bstein.dev - METIS_METRICS_PORT: "8080" - METIS_METRICS_PATH: /metrics - METIS_SENTINEL_PUSH_URL: http://metis.maintenance.svc.cluster.local/api/internal/sentinel/snapshots - METIS_SENTINEL_PUSH_TIMEOUT_SEC: "10" - METIS_SENTINEL_PUSH_INTERVAL_SEC: "120" - METIS_SENTINEL_OUT: /var/run/metis-sentinel - METIS_SENTINEL_INTERVAL_SEC: "120" + METIS_BIND_ADDR: :8080 + METIS_INVENTORY_PATH: /app/inventory.titan-rpi4.yaml + METIS_DATA_DIR: /var/lib/metis + METIS_DEFAULT_FLASH_HOST: titan-22 + METIS_FLASH_HOSTS: titan-22 + METIS_LOCAL_HOST: titan-22 + METIS_ALLOWED_GROUPS: admin,maintainer + METIS_MAX_DEVICE_BYTES: "300000000000" + METIS_SENTINEL_PUSH_URL: http://metis.maintenance.svc.cluster.local/internal/sentinel/snapshot + METIS_SENTINEL_INTERVAL_SEC: "1800" + METIS_SENTINEL_NSENTER: "1" + METIS_IMAGE_RPI4_ARMBIAN_LONGHORN: https://armbian.chi.auroradev.org/dl/rpi4b/archive/Armbian_26.2.1_Rpi4b_noble_current_6.18.9_minimal.img.xz + METIS_IMAGE_RPI4_ARMBIAN_LONGHORN_SHA256: sha256:c450687adf4cc6a59725c43aefd58baf42ec71bdd379227d403cdde281768e46 diff --git a/services/maintenance/metis-data-pvc.yaml b/services/maintenance/metis-data-pvc.yaml new file mode 100644 index 00000000..9afc7138 --- /dev/null +++ b/services/maintenance/metis-data-pvc.yaml @@ -0,0 +1,13 @@ +# services/maintenance/metis-data-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: metis-data + namespace: maintenance +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 40Gi + storageClassName: local-path diff --git a/services/maintenance/metis-k3s-token-sync-cronjob.yaml b/services/maintenance/metis-k3s-token-sync-cronjob.yaml new file mode 100644 index 00000000..9c5a3a58 --- /dev/null +++ b/services/maintenance/metis-k3s-token-sync-cronjob.yaml @@ -0,0 +1,51 @@ +# services/maintenance/metis-k3s-token-sync-cronjob.yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: metis-k3s-token-sync + namespace: maintenance +spec: + schedule: "11 */6 * * *" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 2 + jobTemplate: + spec: + template: + spec: + serviceAccountName: metis-token-sync + restartPolicy: OnFailure + nodeSelector: + kubernetes.io/arch: arm64 + node-role.kubernetes.io/control-plane: "true" + tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + containers: + - name: sync + image: registry.bstein.dev/bstein/kubectl:1.35.0 + imagePullPolicy: IfNotPresent + command: + - /bin/sh + - -c + args: + - | + set -euo pipefail + token="$(tr -d '\n' < /host/var/lib/rancher/k3s/server/node-token)" + kubectl -n maintenance create secret generic metis-runtime \ + --from-literal=k3s_token="${token}" \ + --dry-run=client -o yaml | kubectl apply -f - + securityContext: + runAsUser: 0 + volumeMounts: + - name: k3s-server + mountPath: /host/var/lib/rancher/k3s/server + readOnly: true + volumes: + - name: k3s-server + hostPath: + path: /var/lib/rancher/k3s/server diff --git a/services/maintenance/metis-rbac.yaml b/services/maintenance/metis-rbac.yaml new file mode 100644 index 00000000..8b922514 --- /dev/null +++ b/services/maintenance/metis-rbac.yaml @@ -0,0 +1,27 @@ +# services/maintenance/metis-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: metis-node-manager +rules: + - apiGroups: [""] + resources: + - nodes + verbs: + - get + - list + - watch + - delete +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: metis-node-manager +subjects: + - kind: ServiceAccount + name: metis + namespace: maintenance +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: metis-node-manager diff --git a/services/maintenance/metis-token-sync-rbac.yaml b/services/maintenance/metis-token-sync-rbac.yaml new file mode 100644 index 00000000..86da52b4 --- /dev/null +++ b/services/maintenance/metis-token-sync-rbac.yaml @@ -0,0 +1,30 @@ +# services/maintenance/metis-token-sync-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: metis-token-sync + namespace: maintenance +rules: + - apiGroups: [""] + resources: + - secrets + verbs: + - get + - list + - create + - update + - patch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: metis-token-sync + namespace: maintenance +subjects: + - kind: ServiceAccount + name: metis-token-sync + namespace: maintenance +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: metis-token-sync diff --git a/services/maintenance/metis-token-sync-serviceaccount.yaml b/services/maintenance/metis-token-sync-serviceaccount.yaml new file mode 100644 index 00000000..55438dcc --- /dev/null +++ b/services/maintenance/metis-token-sync-serviceaccount.yaml @@ -0,0 +1,6 @@ +# services/maintenance/metis-token-sync-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: metis-token-sync + namespace: maintenance diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index f4a59a0d..be633f69 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -1125,7 +1125,7 @@ { "id": 17, "type": "stat", - "title": "Ariadne + Metis CI Coverage (%)", + "title": "Platform CI Coverage (%)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1183,12 +1183,13 @@ "values": false }, "textMode": "value" - } + }, + "description": "Internal source panel for Atlas Overview automation test rollups." }, { "id": 18, "type": "table", - "title": "Ariadne + Metis CI Tests (latest)", + "title": "Platform CI Tests (latest)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1233,7 +1234,8 @@ "order": "desc" } } - ] + ], + "description": "Atlas Overview test panels depend on these internal repo-tagged CI series." } ], "time": { diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 3c512dee..98da39aa 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -1677,7 +1677,7 @@ { "id": 42, "type": "timeseries", - "title": "Ariadne + Metis Test Success Rate", + "title": "Platform Test Success Rate", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1709,12 +1709,13 @@ "tooltip": { "mode": "multi" } - } + }, + "description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. Add new test series there first so they roll up here." }, { "id": 43, "type": "bargauge", - "title": "Ariadne + Metis Tests with Failures (24h)", + "title": "Platform Tests with Failures (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1814,7 +1815,8 @@ "order": "desc" } } - ] + ], + "description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query." }, { "id": 11, diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 452921a8..af6f2652 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -1134,7 +1134,7 @@ data: { "id": 17, "type": "stat", - "title": "Ariadne + Metis CI Coverage (%)", + "title": "Platform CI Coverage (%)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1192,12 +1192,13 @@ data: "values": false }, "textMode": "value" - } + }, + "description": "Internal source panel for Atlas Overview automation test rollups." }, { "id": 18, "type": "table", - "title": "Ariadne + Metis CI Tests (latest)", + "title": "Platform CI Tests (latest)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1242,7 +1243,8 @@ data: "order": "desc" } } - ] + ], + "description": "Atlas Overview test panels depend on these internal repo-tagged CI series." } ], "time": { diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index 0771db58..7e9faf45 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1686,7 +1686,7 @@ data: { "id": 42, "type": "timeseries", - "title": "Ariadne + Metis Test Success Rate", + "title": "Platform Test Success Rate", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1718,12 +1718,13 @@ data: "tooltip": { "mode": "multi" } - } + }, + "description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. Add new test series there first so they roll up here." }, { "id": 43, "type": "bargauge", - "title": "Ariadne + Metis Tests with Failures (24h)", + "title": "Platform Tests with Failures (24h)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -1823,7 +1824,8 @@ data: "order": "desc" } } - ] + ], + "description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query." }, { "id": 11,