diff --git a/ci/tests/glue/config.yaml b/ci/tests/glue/config.yaml index 5a89ac80..bae31d04 100644 --- a/ci/tests/glue/config.yaml +++ b/ci/tests/glue/config.yaml @@ -36,3 +36,17 @@ ariadne_schedule_tasks: - task: schedule.comms_seed_room check_last_success: true max_success_age_hours: 48 + - task: schedule.pod_cleaner + check_last_success: true + max_success_age_hours: 6 + - task: schedule.opensearch_prune + check_last_success: false + - task: schedule.image_sweeper + check_last_success: true + max_success_age_hours: 18 + - task: schedule.metis_k3s_token_sync + check_last_success: true + max_success_age_hours: 12 + - task: schedule.platform_quality_suite_probe + check_last_success: true + max_success_age_hours: 2 diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 4755c4b5..b1aa0bbf 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -400,6 +400,11 @@ ARIADNE_ALL_SCHEDULE_TASKS = [ "schedule.comms_pin_invite", "schedule.comms_reset_room", "schedule.comms_seed_room", + "schedule.pod_cleaner", + "schedule.opensearch_prune", + "schedule.image_sweeper", + "schedule.metis_k3s_token_sync", + "schedule.platform_quality_suite_probe", ] ARIADNE_FAST_SCHEDULE_TASKS = [ task @@ -414,6 +419,10 @@ ARIADNE_SCHEDULE_HEALTH_TASKS = [ "schedule.firefly_user_sync", "schedule.comms_guest_name", "schedule.comms_seed_room", + "schedule.pod_cleaner", + "schedule.image_sweeper", + "schedule.metis_k3s_token_sync", + "schedule.platform_quality_suite_probe", ] ARIADNE_ALL_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_ALL_SCHEDULE_TASKS)})$"' ARIADNE_FAST_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_FAST_SCHEDULE_TASKS)})$"' diff --git a/services/logging/kustomization.yaml b/services/logging/kustomization.yaml index dc487155..157c9737 100644 --- a/services/logging/kustomization.yaml +++ b/services/logging/kustomization.yaml @@ -18,7 +18,6 @@ resources: - oneoffs/opensearch-ism-job.yaml - oneoffs/opensearch-dashboards-setup-job.yaml - oneoffs/opensearch-observability-setup-job.yaml - - opensearch-prune-cronjob.yaml - fluent-bit-helmrelease.yaml - node-log-rotation-daemonset.yaml - node-image-gc-rpi4-daemonset.yaml @@ -46,12 +45,6 @@ configMapGenerator: - node_image_prune_rpi5.sh=scripts/node_image_prune_rpi5.sh options: disableNameSuffixHash: true - - name: opensearch-prune-script - namespace: logging - files: - - prune.py=scripts/opensearch_prune.py - options: - disableNameSuffixHash: true - name: opensearch-observability-script namespace: logging files: diff --git a/services/logging/opensearch-prune-cronjob.yaml b/services/logging/opensearch-prune-cronjob.yaml deleted file mode 100644 index dc0dffb2..00000000 --- a/services/logging/opensearch-prune-cronjob.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# services/logging/opensearch-prune-cronjob.yaml -apiVersion: batch/v1 -kind: CronJob -metadata: - name: opensearch-prune - namespace: logging -spec: - schedule: "23 3 * * *" - suspend: true - concurrencyPolicy: Forbid - successfulJobsHistoryLimit: 1 - failedJobsHistoryLimit: 3 - jobTemplate: - spec: - backoffLimit: 2 - template: - spec: - restartPolicy: OnFailure - nodeSelector: - node-role.kubernetes.io/worker: "true" - hardware: rpi5 - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: hardware - operator: In - values: - - rpi5 - containers: - - name: prune - image: python:3.11-alpine - command: ["python", "/scripts/prune.py"] - env: - - name: OPENSEARCH_URL - value: http://opensearch-master.logging.svc.cluster.local:9200 - - name: LOG_LIMIT_BYTES - value: "1099511627776" - - name: LOG_INDEX_PATTERNS - value: "kube-*,journald-*,trace-analytics-*" - volumeMounts: - - name: scripts - mountPath: /scripts - volumes: - - name: scripts - configMap: - name: opensearch-prune-script diff --git a/services/logging/scripts/opensearch_prune.py b/services/logging/scripts/opensearch_prune.py deleted file mode 100644 index ad84d5b5..00000000 --- a/services/logging/scripts/opensearch_prune.py +++ /dev/null @@ -1,77 +0,0 @@ -import json -import os -import re -import sys -import urllib.error -import urllib.request - -os_url = os.environ.get("OPENSEARCH_URL", "http://opensearch-master.logging.svc.cluster.local:9200").rstrip("/") -limit_bytes = int(os.environ.get("LOG_LIMIT_BYTES", str(1024**4))) -patterns = [p.strip() for p in os.environ.get("LOG_INDEX_PATTERNS", "kube-*,journald-*").split(",") if p.strip()] - -UNITS = { - "b": 1, - "kb": 1024, - "mb": 1024**2, - "gb": 1024**3, - "tb": 1024**4, -} - -def parse_size(value: str) -> int: - if not value: - return 0 - text = value.strip().lower() - if text in ("-", "0"): - return 0 - match = re.match(r"^([0-9.]+)([a-z]+)$", text) - if not match: - return 0 - number = float(match.group(1)) - unit = match.group(2) - if unit not in UNITS: - return 0 - return int(number * UNITS[unit]) - -def request_json(path: str): - url = f"{os_url}{path}" - with urllib.request.urlopen(url, timeout=30) as response: - payload = response.read().decode("utf-8") - return json.loads(payload) - -def delete_index(index: str) -> None: - url = f"{os_url}/{index}" - req = urllib.request.Request(url, method="DELETE") - with urllib.request.urlopen(req, timeout=30) as response: - _ = response.read() - print(f"deleted {index}") - -indices = [] -for pattern in patterns: - try: - data = request_json(f"/_cat/indices/{pattern}?format=json&h=index,store.size,creation.date") - except urllib.error.HTTPError as exc: - if exc.code == 404: - continue - raise - for item in data: - index = item.get("index") - if not index or index.startswith("."): - continue - size = parse_size(item.get("store.size", "")) - created = int(item.get("creation.date", "0") or 0) - indices.append({"index": index, "size": size, "created": created}) - -total = sum(item["size"] for item in indices) -print(f"total_log_bytes={total}") -if total <= limit_bytes: - print("within limit") - sys.exit(0) - -indices.sort(key=lambda item: item["created"]) -for item in indices: - if total <= limit_bytes: - break - delete_index(item["index"]) - total -= item["size"] - -print(f"remaining_log_bytes={total}") diff --git a/services/maintenance/ariadne-deployment.yaml b/services/maintenance/ariadne-deployment.yaml index 390e5b36..348d15f1 100644 --- a/services/maintenance/ariadne-deployment.yaml +++ b/services/maintenance/ariadne-deployment.yaml @@ -345,6 +345,10 @@ spec: value: "15" - name: ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH value: "*/30 * * * *" + - name: ARIADNE_SCHEDULE_METIS_K3S_TOKEN_SYNC + value: "11 */6 * * *" + - name: ARIADNE_SCHEDULE_PLATFORM_QUALITY_SUITE_PROBE + value: "*/15 * * * *" - name: METRICS_PATH value: "/metrics" resources: diff --git a/services/maintenance/image-sweeper-cronjob.yaml b/services/maintenance/image-sweeper-cronjob.yaml deleted file mode 100644 index 00392060..00000000 --- a/services/maintenance/image-sweeper-cronjob.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# services/maintenance/image-sweeper-cronjob.yaml -apiVersion: batch/v1 -kind: CronJob -metadata: - name: image-sweeper - namespace: maintenance -spec: - schedule: "30 4 * * 0" - suspend: true - concurrencyPolicy: Forbid - successfulJobsHistoryLimit: 2 - failedJobsHistoryLimit: 2 - jobTemplate: - spec: - template: - spec: - serviceAccountName: node-image-sweeper - restartPolicy: OnFailure - nodeSelector: - kubernetes.io/os: linux - kubernetes.io/arch: arm64 - node-role.kubernetes.io/worker: "true" - tolerations: - - key: node-role.kubernetes.io/control-plane - operator: Exists - effect: NoSchedule - - key: node-role.kubernetes.io/master - operator: Exists - effect: NoSchedule - containers: - - name: image-sweeper - image: python:3.12.9-alpine3.20 - command: ["/bin/sh", "/scripts/node_image_sweeper.sh"] - env: - - name: ONE_SHOT - value: "true" - securityContext: - privileged: true - runAsUser: 0 - volumeMounts: - - name: host-root - mountPath: /host - - name: script - mountPath: /scripts - readOnly: true - volumes: - - name: host-root - hostPath: - path: / - - name: script - configMap: - name: node-image-sweeper-script - defaultMode: 0555 diff --git a/services/maintenance/kustomization.yaml b/services/maintenance/kustomization.yaml index a7b6d82a..5ff09a17 100644 --- a/services/maintenance/kustomization.yaml +++ b/services/maintenance/kustomization.yaml @@ -7,10 +7,13 @@ resources: - secretproviderclass.yaml - metis-configmap.yaml - metis-data-pvc.yaml + - soteria-configmap.yaml - vault-serviceaccount.yaml - vault-sync-deployment.yaml - ariadne-serviceaccount.yaml + - soteria-serviceaccount.yaml - ariadne-rbac.yaml + - soteria-rbac.yaml - disable-k3s-traefik-serviceaccount.yaml - disable-k3s-traefik-rbac.yaml - k3s-traefik-cleanup-rbac.yaml @@ -21,19 +24,18 @@ resources: - pod-cleaner-rbac.yaml - ariadne-deployment.yaml - metis-deployment.yaml + - soteria-deployment.yaml - oneoffs/ariadne-migrate-job.yaml - ariadne-service.yaml + - soteria-service.yaml - disable-k3s-traefik-daemonset.yaml - oneoffs/k3s-traefik-cleanup-job.yaml - node-nofile-daemonset.yaml - metis-sentinel-amd64-daemonset.yaml - metis-sentinel-arm64-daemonset.yaml - - metis-k3s-token-sync-cronjob.yaml - k3s-agent-restart-daemonset.yaml - - pod-cleaner-cronjob.yaml - node-image-sweeper-serviceaccount.yaml - node-image-sweeper-daemonset.yaml - - image-sweeper-cronjob.yaml - metis-service.yaml - oauth2-proxy-metis.yaml - metis-certificate.yaml @@ -43,6 +45,8 @@ images: newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"} - name: registry.bstein.dev/bstein/metis newTag: 0.1.0-9-amd64 + - name: registry.bstein.dev/bstein/soteria + newTag: 0.1.0-11 # {"$imagepolicy": "maintenance:soteria:tag"} configMapGenerator: - name: disable-k3s-traefik-script namespace: maintenance @@ -62,12 +66,6 @@ configMapGenerator: - node_nofile.sh=scripts/node_nofile.sh options: disableNameSuffixHash: true - - name: pod-cleaner-script - namespace: maintenance - files: - - pod_cleaner.sh=scripts/pod_cleaner.sh - options: - disableNameSuffixHash: true - name: node-image-sweeper-script namespace: maintenance files: diff --git a/services/maintenance/metis-k3s-token-sync-cronjob.yaml b/services/maintenance/metis-k3s-token-sync-cronjob.yaml deleted file mode 100644 index 7ef49087..00000000 --- a/services/maintenance/metis-k3s-token-sync-cronjob.yaml +++ /dev/null @@ -1,55 +0,0 @@ -# services/maintenance/metis-k3s-token-sync-cronjob.yaml -apiVersion: batch/v1 -kind: CronJob -metadata: - name: metis-k3s-token-sync - namespace: maintenance -spec: - schedule: "11 */6 * * *" - concurrencyPolicy: Forbid - successfulJobsHistoryLimit: 1 - failedJobsHistoryLimit: 2 - jobTemplate: - spec: - template: - spec: - serviceAccountName: metis-token-sync - restartPolicy: OnFailure - nodeName: titan-0a - tolerations: - - key: node-role.kubernetes.io/control-plane - operator: Exists - effect: NoSchedule - - key: node-role.kubernetes.io/master - operator: Exists - effect: NoSchedule - containers: - - name: sync - image: hashicorp/vault:1.17.6 - imagePullPolicy: IfNotPresent - command: - - /bin/sh - - -c - args: - - | - set -eu - token="$(tr -d '\n' < /host/var/lib/rancher/k3s/server/token)" - jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" - VAULT_TOKEN="$(vault write -field=token auth/kubernetes/login role="${VAULT_K8S_ROLE}" jwt="${jwt}")" - export VAULT_TOKEN - vault kv put kv/atlas/maintenance/metis-runtime k3s_token="${token}" - env: - - name: VAULT_ADDR - value: http://vault.vault.svc.cluster.local:8200 - - name: VAULT_K8S_ROLE - value: maintenance-metis-token-sync - securityContext: - runAsUser: 0 - volumeMounts: - - name: k3s-server - mountPath: /host/var/lib/rancher/k3s/server - readOnly: true - volumes: - - name: k3s-server - hostPath: - path: /var/lib/rancher/k3s/server diff --git a/services/maintenance/pod-cleaner-cronjob.yaml b/services/maintenance/pod-cleaner-cronjob.yaml deleted file mode 100644 index 99d13f67..00000000 --- a/services/maintenance/pod-cleaner-cronjob.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# services/maintenance/pod-cleaner-cronjob.yaml -apiVersion: batch/v1 -kind: CronJob -metadata: - name: pod-cleaner - namespace: maintenance -spec: - schedule: "0 * * * *" - suspend: true - concurrencyPolicy: Forbid - successfulJobsHistoryLimit: 1 - failedJobsHistoryLimit: 3 - jobTemplate: - spec: - backoffLimit: 1 - template: - spec: - serviceAccountName: pod-cleaner - restartPolicy: Never - nodeSelector: - kubernetes.io/arch: arm64 - node-role.kubernetes.io/worker: "true" - containers: - - name: cleaner - image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131 - command: ["/usr/bin/env", "bash"] - args: ["/scripts/pod_cleaner.sh"] - volumeMounts: - - name: script - mountPath: /scripts - readOnly: true - volumes: - - name: script - configMap: - name: pod-cleaner-script - defaultMode: 0555 diff --git a/services/maintenance/scripts/pod_cleaner.sh b/services/maintenance/scripts/pod_cleaner.sh deleted file mode 100644 index 2ec043e2..00000000 --- a/services/maintenance/scripts/pod_cleaner.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -for phase in Succeeded Failed; do - kubectl get pods -A --field-selector="status.phase=${phase}" \ - -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' \ - | while read -r namespace name; do - if [ -n "${namespace}" ] && [ -n "${name}" ]; then - kubectl delete pod -n "${namespace}" "${name}" --ignore-not-found --grace-period=0 --wait=false - fi - done -done diff --git a/services/monitoring/dashboards/atlas-jobs.json b/services/monitoring/dashboards/atlas-jobs.json index 95ca2b41..a54789eb 100644 --- a/services/monitoring/dashboards/atlas-jobs.json +++ b/services/monitoring/dashboards/atlas-jobs.json @@ -248,7 +248,7 @@ }, "targets": [ { - "expr": "sum((((time() - ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"})) > bool 129600)) or on() vector(0)", + "expr": "sum((((time() - ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"})) > bool 129600)) or on() vector(0)", "refId": "A" } ], @@ -316,7 +316,7 @@ }, "targets": [ { - "expr": "count((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"} unless on(task) ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"})) or on() vector(0)", + "expr": "count((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"} unless on(task) ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"})) or on() vector(0)", "refId": "A" } ], @@ -376,7 +376,7 @@ }, "targets": [ { - "expr": "sum(((1 - ariadne_schedule_last_status{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"}) > bool 0)) or on() vector(0)", + "expr": "sum(((1 - ariadne_schedule_last_status{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}) > bool 0)) or on() vector(0)", "refId": "A" } ], @@ -616,7 +616,7 @@ }, "targets": [ { - "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"}[$__range])) / 3600)", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -691,7 +691,7 @@ }, "targets": [ { - "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"}[$__range])) / 3600)", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -766,7 +766,7 @@ }, "targets": [ { - "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_seed_room)$\"}[$__range])) / 3600)", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -841,7 +841,7 @@ }, "targets": [ { - "expr": "sort_desc(((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"} - time()) / 3600))", + "expr": "sort_desc(((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"} - time()) / 3600))", "refId": "A", "legendFormat": "{{task}}", "instant": true diff --git a/services/monitoring/grafana-dashboard-jobs.yaml b/services/monitoring/grafana-dashboard-jobs.yaml index 4001aa95..dd2b019b 100644 --- a/services/monitoring/grafana-dashboard-jobs.yaml +++ b/services/monitoring/grafana-dashboard-jobs.yaml @@ -257,7 +257,7 @@ data: }, "targets": [ { - "expr": "sum((((time() - ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"})) > bool 129600)) or on() vector(0)", + "expr": "sum((((time() - ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"})) > bool 129600)) or on() vector(0)", "refId": "A" } ], @@ -325,7 +325,7 @@ data: }, "targets": [ { - "expr": "count((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"} unless on(task) ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"})) or on() vector(0)", + "expr": "count((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"} unless on(task) ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"})) or on() vector(0)", "refId": "A" } ], @@ -385,7 +385,7 @@ data: }, "targets": [ { - "expr": "sum(((1 - ariadne_schedule_last_status{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"}) > bool 0)) or on() vector(0)", + "expr": "sum(((1 - ariadne_schedule_last_status{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}) > bool 0)) or on() vector(0)", "refId": "A" } ], @@ -625,7 +625,7 @@ data: }, "targets": [ { - "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"}[$__range])) / 3600)", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -700,7 +700,7 @@ data: }, "targets": [ { - "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"}[$__range])) / 3600)", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -775,7 +775,7 @@ data: }, "targets": [ { - "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_seed_room)$\"}[$__range])) / 3600)", + "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)", "refId": "A", "legendFormat": "{{task}}", "instant": true @@ -850,7 +850,7 @@ data: }, "targets": [ { - "expr": "sort_desc(((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"} - time()) / 3600))", + "expr": "sort_desc(((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"} - time()) / 3600))", "refId": "A", "legendFormat": "{{task}}", "instant": true diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 67580f60..68d35f3f 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -23,7 +23,6 @@ resources: - platform-quality-gateway-pvc.yaml - platform-quality-gateway-service.yaml - platform-quality-gateway-deployment.yaml - - platform-quality-suite-probe-cronjob.yaml - vault-sync-deployment.yaml - grafana-alerting-config.yaml - grafana-folders.yaml diff --git a/services/monitoring/platform-quality-suite-probe-cronjob.yaml b/services/monitoring/platform-quality-suite-probe-cronjob.yaml deleted file mode 100644 index 8685ef11..00000000 --- a/services/monitoring/platform-quality-suite-probe-cronjob.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# services/monitoring/platform-quality-suite-probe-cronjob.yaml -apiVersion: batch/v1 -kind: CronJob -metadata: - name: platform-quality-suite-probe - namespace: monitoring -spec: - schedule: "*/15 * * * *" - concurrencyPolicy: Forbid - successfulJobsHistoryLimit: 2 - failedJobsHistoryLimit: 2 - jobTemplate: - spec: - backoffLimit: 0 - template: - metadata: - labels: - app: platform-quality-suite-probe - spec: - restartPolicy: Never - containers: - - name: probe - image: curlimages/curl:8.12.1 - imagePullPolicy: IfNotPresent - command: ["/bin/sh", "/scripts/platform_quality_suite_probe.sh"] - env: - - name: PUSHGATEWAY_URL - value: http://platform-quality-gateway.monitoring.svc.cluster.local:9091 - - name: HTTP_TIMEOUT_SECONDS - value: "12" - volumeMounts: - - name: probe-script - mountPath: /scripts - readOnly: true - volumes: - - name: probe-script - configMap: - name: platform-quality-suite-probe-script - defaultMode: 0555