ariadne: remove remaining cronjobs and migrate schedule ownership

This commit is contained in:
Brad Stein 2026-04-10 22:40:58 -03:00
parent 166020ca1d
commit 64b4f14018
15 changed files with 48 additions and 351 deletions

View File

@ -36,3 +36,17 @@ ariadne_schedule_tasks:
- task: schedule.comms_seed_room - task: schedule.comms_seed_room
check_last_success: true check_last_success: true
max_success_age_hours: 48 max_success_age_hours: 48
- task: schedule.pod_cleaner
check_last_success: true
max_success_age_hours: 6
- task: schedule.opensearch_prune
check_last_success: false
- task: schedule.image_sweeper
check_last_success: true
max_success_age_hours: 18
- task: schedule.metis_k3s_token_sync
check_last_success: true
max_success_age_hours: 12
- task: schedule.platform_quality_suite_probe
check_last_success: true
max_success_age_hours: 2

View File

@ -400,6 +400,11 @@ ARIADNE_ALL_SCHEDULE_TASKS = [
"schedule.comms_pin_invite", "schedule.comms_pin_invite",
"schedule.comms_reset_room", "schedule.comms_reset_room",
"schedule.comms_seed_room", "schedule.comms_seed_room",
"schedule.pod_cleaner",
"schedule.opensearch_prune",
"schedule.image_sweeper",
"schedule.metis_k3s_token_sync",
"schedule.platform_quality_suite_probe",
] ]
ARIADNE_FAST_SCHEDULE_TASKS = [ ARIADNE_FAST_SCHEDULE_TASKS = [
task task
@ -414,6 +419,10 @@ ARIADNE_SCHEDULE_HEALTH_TASKS = [
"schedule.firefly_user_sync", "schedule.firefly_user_sync",
"schedule.comms_guest_name", "schedule.comms_guest_name",
"schedule.comms_seed_room", "schedule.comms_seed_room",
"schedule.pod_cleaner",
"schedule.image_sweeper",
"schedule.metis_k3s_token_sync",
"schedule.platform_quality_suite_probe",
] ]
ARIADNE_ALL_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_ALL_SCHEDULE_TASKS)})$"' ARIADNE_ALL_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_ALL_SCHEDULE_TASKS)})$"'
ARIADNE_FAST_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_FAST_SCHEDULE_TASKS)})$"' ARIADNE_FAST_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_FAST_SCHEDULE_TASKS)})$"'

View File

@ -18,7 +18,6 @@ resources:
- oneoffs/opensearch-ism-job.yaml - oneoffs/opensearch-ism-job.yaml
- oneoffs/opensearch-dashboards-setup-job.yaml - oneoffs/opensearch-dashboards-setup-job.yaml
- oneoffs/opensearch-observability-setup-job.yaml - oneoffs/opensearch-observability-setup-job.yaml
- opensearch-prune-cronjob.yaml
- fluent-bit-helmrelease.yaml - fluent-bit-helmrelease.yaml
- node-log-rotation-daemonset.yaml - node-log-rotation-daemonset.yaml
- node-image-gc-rpi4-daemonset.yaml - node-image-gc-rpi4-daemonset.yaml
@ -46,12 +45,6 @@ configMapGenerator:
- node_image_prune_rpi5.sh=scripts/node_image_prune_rpi5.sh - node_image_prune_rpi5.sh=scripts/node_image_prune_rpi5.sh
options: options:
disableNameSuffixHash: true disableNameSuffixHash: true
- name: opensearch-prune-script
namespace: logging
files:
- prune.py=scripts/opensearch_prune.py
options:
disableNameSuffixHash: true
- name: opensearch-observability-script - name: opensearch-observability-script
namespace: logging namespace: logging
files: files:

View File

@ -1,48 +0,0 @@
# services/logging/opensearch-prune-cronjob.yaml
# CronJob that runs /scripts/prune.py in a python:3.11-alpine container; the
# script directory is mounted from the opensearch-prune-script ConfigMap.
apiVersion: batch/v1
kind: CronJob
metadata:
name: opensearch-prune
namespace: logging
spec:
# Cron expression: minute 23 of hour 3, every day.
schedule: "23 3 * * *"
# While suspend is true, no new Jobs are created from this schedule.
suspend: true
# Forbid: a run is skipped if the previous run has not finished yet.
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 1
failedJobsHistoryLimit: 3
jobTemplate:
spec:
backoffLimit: 2
template:
spec:
restartPolicy: OnFailure
# NOTE(review): the nodeSelector and the required nodeAffinity below both
# appear to pin the pod to hardware=rpi5 nodes; one of them looks redundant.
nodeSelector:
node-role.kubernetes.io/worker: "true"
hardware: rpi5
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: hardware
operator: In
values:
- rpi5
containers:
- name: prune
image: python:3.11-alpine
command: ["python", "/scripts/prune.py"]
env:
- name: OPENSEARCH_URL
value: http://opensearch-master.logging.svc.cluster.local:9200
# 1099511627776 = 1024**4 bytes (1 TiB).
- name: LOG_LIMIT_BYTES
value: "1099511627776"
# Comma-separated list of index name patterns.
- name: LOG_INDEX_PATTERNS
value: "kube-*,journald-*,trace-analytics-*"
volumeMounts:
- name: scripts
mountPath: /scripts
volumes:
- name: scripts
configMap:
name: opensearch-prune-script

View File

@ -1,77 +0,0 @@
import json
import os
import re
import sys
import urllib.error
import urllib.request
# Runtime configuration, read once from the environment when the script starts.
# Base URL of the OpenSearch HTTP API, without a trailing slash.
os_url = os.getenv("OPENSEARCH_URL", "http://opensearch-master.logging.svc.cluster.local:9200").rstrip("/")
# Upper bound, in bytes, for the combined size of the matched indices.
limit_bytes = int(os.getenv("LOG_LIMIT_BYTES", str(1024**4)))
# Index name patterns to inspect; blank entries in the comma-separated list are dropped.
patterns = [
    entry
    for entry in (raw.strip() for raw in os.getenv("LOG_INDEX_PATTERNS", "kube-*,journald-*").split(","))
    if entry
]
# Multipliers for the suffixes of human-readable sizes (binary units:
# 1 kb = 1024 b).
UNITS = {
    "b": 1,
    "kb": 1024,
    "mb": 1024**2,
    "gb": 1024**3,
    "tb": 1024**4,
    "pb": 1024**5,
}


def parse_size(value: str) -> int:
    """Convert a human-readable size such as ``"1.5gb"`` into a byte count.

    Matching is case-insensitive and ignores surrounding whitespace. Any
    value that cannot be interpreted -- empty/``None``, ``"-"``, ``"0"``, a
    missing or unknown unit suffix, or a malformed number such as
    ``"1.2.3kb"`` -- yields 0 rather than raising.

    Args:
        value: Size text made of a number immediately followed by a unit
            suffix listed in ``UNITS``.

    Returns:
        The size in bytes, truncated to an integer; 0 if unparseable.
    """
    if not value:
        return 0
    text = value.strip().lower()
    if text in ("-", "0"):
        return 0
    match = re.match(r"^([0-9.]+)([a-z]+)$", text)
    if not match:
        return 0
    multiplier = UNITS.get(match.group(2))
    if multiplier is None:
        return 0
    try:
        number = float(match.group(1))
    except ValueError:
        # The pattern admits strings like "1.2.3" or ".." that float()
        # rejects; treat them like every other malformed size.
        return 0
    return int(number * multiplier)
def request_json(path: str):
    """GET ``os_url + path`` and return the response body parsed as JSON.

    The request uses a 30 second timeout and the body is decoded as UTF-8.
    Errors raised by ``urlopen`` or ``json.loads`` propagate to the caller.
    """
    with urllib.request.urlopen(os_url + path, timeout=30) as reply:
        raw = reply.read()
    return json.loads(raw.decode("utf-8"))
def delete_index(index: str) -> None:
    """Send an HTTP DELETE for ``<os_url>/<index>`` and report it on stdout.

    Uses a 30 second timeout; errors raised by ``urlopen`` propagate.
    """
    request = urllib.request.Request(f"{os_url}/{index}", method="DELETE")
    with urllib.request.urlopen(request, timeout=30) as reply:
        reply.read()  # drain the response body; its content is not used
    print(f"deleted {index}")
# Main flow: collect the indices matching every configured pattern, then
# delete the oldest ones until their combined size is within limit_bytes.
indices = []
# Names already recorded. Overlapping patterns can return the same index more
# than once; counting it twice would inflate the total and later trigger a
# second DELETE for an index that no longer exists.
seen = set()
for pattern in patterns:
    try:
        data = request_json(f"/_cat/indices/{pattern}?format=json&h=index,store.size,creation.date")
    except urllib.error.HTTPError as exc:
        # A 404 for one pattern is skipped; any other HTTP error aborts the run.
        if exc.code == 404:
            continue
        raise
    for item in data:
        index = item.get("index")
        # Skip rows without a name, dot-prefixed indices, and duplicates.
        if not index or index.startswith(".") or index in seen:
            continue
        seen.add(index)
        size = parse_size(item.get("store.size", ""))
        # A missing or empty creation date is treated as 0, which sorts first.
        created = int(item.get("creation.date", "0") or 0)
        indices.append({"index": index, "size": size, "created": created})
total = sum(item["size"] for item in indices)
print(f"total_log_bytes={total}")
if total <= limit_bytes:
    print("within limit")
    sys.exit(0)
# Ascending creation value, so the first entries are deleted first.
indices.sort(key=lambda item: item["created"])
for item in indices:
    if total <= limit_bytes:
        break
    delete_index(item["index"])
    total -= item["size"]
print(f"remaining_log_bytes={total}")

View File

@ -345,6 +345,10 @@ spec:
value: "15" value: "15"
- name: ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH - name: ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH
value: "*/30 * * * *" value: "*/30 * * * *"
- name: ARIADNE_SCHEDULE_METIS_K3S_TOKEN_SYNC
value: "11 */6 * * *"
- name: ARIADNE_SCHEDULE_PLATFORM_QUALITY_SUITE_PROBE
value: "*/15 * * * *"
- name: METRICS_PATH - name: METRICS_PATH
value: "/metrics" value: "/metrics"
resources: resources:

View File

@ -1,53 +0,0 @@
# services/maintenance/image-sweeper-cronjob.yaml
# CronJob that runs /scripts/node_image_sweeper.sh (from the
# node-image-sweeper-script ConfigMap) as a privileged root container with the
# node's root filesystem mounted at /host.
apiVersion: batch/v1
kind: CronJob
metadata:
name: image-sweeper
namespace: maintenance
spec:
# Cron expression: 04:30 on day-of-week 0 (Sunday).
schedule: "30 4 * * 0"
# While suspend is true, no new Jobs are created from this schedule.
suspend: true
# Forbid: a run is skipped if the previous run has not finished yet.
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 2
failedJobsHistoryLimit: 2
jobTemplate:
spec:
template:
spec:
serviceAccountName: node-image-sweeper
restartPolicy: OnFailure
nodeSelector:
kubernetes.io/os: linux
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
# Tolerate the control-plane/master NoSchedule taints.
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
containers:
- name: image-sweeper
image: python:3.12.9-alpine3.20
command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
env:
# NOTE(review): presumably makes the script do a single pass and exit
# instead of looping -- confirm in node_image_sweeper.sh.
- name: ONE_SHOT
value: "true"
securityContext:
privileged: true
runAsUser: 0
volumeMounts:
- name: host-root
mountPath: /host
- name: script
mountPath: /scripts
readOnly: true
volumes:
# hostPath "/" exposes the whole node filesystem to the container.
- name: host-root
hostPath:
path: /
- name: script
configMap:
name: node-image-sweeper-script
# Octal 0555: read and execute for everyone, no write.
defaultMode: 0555

View File

@ -7,10 +7,13 @@ resources:
- secretproviderclass.yaml - secretproviderclass.yaml
- metis-configmap.yaml - metis-configmap.yaml
- metis-data-pvc.yaml - metis-data-pvc.yaml
- soteria-configmap.yaml
- vault-serviceaccount.yaml - vault-serviceaccount.yaml
- vault-sync-deployment.yaml - vault-sync-deployment.yaml
- ariadne-serviceaccount.yaml - ariadne-serviceaccount.yaml
- soteria-serviceaccount.yaml
- ariadne-rbac.yaml - ariadne-rbac.yaml
- soteria-rbac.yaml
- disable-k3s-traefik-serviceaccount.yaml - disable-k3s-traefik-serviceaccount.yaml
- disable-k3s-traefik-rbac.yaml - disable-k3s-traefik-rbac.yaml
- k3s-traefik-cleanup-rbac.yaml - k3s-traefik-cleanup-rbac.yaml
@ -21,19 +24,18 @@ resources:
- pod-cleaner-rbac.yaml - pod-cleaner-rbac.yaml
- ariadne-deployment.yaml - ariadne-deployment.yaml
- metis-deployment.yaml - metis-deployment.yaml
- soteria-deployment.yaml
- oneoffs/ariadne-migrate-job.yaml - oneoffs/ariadne-migrate-job.yaml
- ariadne-service.yaml - ariadne-service.yaml
- soteria-service.yaml
- disable-k3s-traefik-daemonset.yaml - disable-k3s-traefik-daemonset.yaml
- oneoffs/k3s-traefik-cleanup-job.yaml - oneoffs/k3s-traefik-cleanup-job.yaml
- node-nofile-daemonset.yaml - node-nofile-daemonset.yaml
- metis-sentinel-amd64-daemonset.yaml - metis-sentinel-amd64-daemonset.yaml
- metis-sentinel-arm64-daemonset.yaml - metis-sentinel-arm64-daemonset.yaml
- metis-k3s-token-sync-cronjob.yaml
- k3s-agent-restart-daemonset.yaml - k3s-agent-restart-daemonset.yaml
- pod-cleaner-cronjob.yaml
- node-image-sweeper-serviceaccount.yaml - node-image-sweeper-serviceaccount.yaml
- node-image-sweeper-daemonset.yaml - node-image-sweeper-daemonset.yaml
- image-sweeper-cronjob.yaml
- metis-service.yaml - metis-service.yaml
- oauth2-proxy-metis.yaml - oauth2-proxy-metis.yaml
- metis-certificate.yaml - metis-certificate.yaml
@ -43,6 +45,8 @@ images:
newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"} newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
- name: registry.bstein.dev/bstein/metis - name: registry.bstein.dev/bstein/metis
newTag: 0.1.0-9-amd64 newTag: 0.1.0-9-amd64
- name: registry.bstein.dev/bstein/soteria
newTag: 0.1.0-11 # {"$imagepolicy": "maintenance:soteria:tag"}
configMapGenerator: configMapGenerator:
- name: disable-k3s-traefik-script - name: disable-k3s-traefik-script
namespace: maintenance namespace: maintenance
@ -62,12 +66,6 @@ configMapGenerator:
- node_nofile.sh=scripts/node_nofile.sh - node_nofile.sh=scripts/node_nofile.sh
options: options:
disableNameSuffixHash: true disableNameSuffixHash: true
- name: pod-cleaner-script
namespace: maintenance
files:
- pod_cleaner.sh=scripts/pod_cleaner.sh
options:
disableNameSuffixHash: true
- name: node-image-sweeper-script - name: node-image-sweeper-script
namespace: maintenance namespace: maintenance
files: files:

View File

@ -1,55 +0,0 @@
# services/maintenance/metis-k3s-token-sync-cronjob.yaml
# CronJob that copies the k3s server token from the node's filesystem into
# Vault at kv/atlas/maintenance/metis-runtime (field k3s_token).
apiVersion: batch/v1
kind: CronJob
metadata:
name: metis-k3s-token-sync
namespace: maintenance
spec:
# Cron expression: minute 11 of every sixth hour.
schedule: "11 */6 * * *"
# Forbid: a run is skipped if the previous run has not finished yet.
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 1
failedJobsHistoryLimit: 2
jobTemplate:
spec:
template:
spec:
serviceAccountName: metis-token-sync
restartPolicy: OnFailure
# Pinned to one node by name; the hostPath volume below reads that node's
# /var/lib/rancher/k3s/server directory.
nodeName: titan-0a
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
containers:
- name: sync
image: hashicorp/vault:1.17.6
imagePullPolicy: IfNotPresent
# The inline script below: reads the token file (newlines stripped), logs in
# to Vault through auth/kubernetes/login with the pod's service-account JWT
# and the VAULT_K8S_ROLE role, then writes the token with `vault kv put`.
command:
- /bin/sh
- -c
args:
- |
set -eu
token="$(tr -d '\n' < /host/var/lib/rancher/k3s/server/token)"
jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
VAULT_TOKEN="$(vault write -field=token auth/kubernetes/login role="${VAULT_K8S_ROLE}" jwt="${jwt}")"
export VAULT_TOKEN
vault kv put kv/atlas/maintenance/metis-runtime k3s_token="${token}"
env:
- name: VAULT_ADDR
value: http://vault.vault.svc.cluster.local:8200
- name: VAULT_K8S_ROLE
value: maintenance-metis-token-sync
securityContext:
runAsUser: 0
volumeMounts:
- name: k3s-server
mountPath: /host/var/lib/rancher/k3s/server
readOnly: true
volumes:
- name: k3s-server
hostPath:
path: /var/lib/rancher/k3s/server

View File

@ -1,36 +0,0 @@
# services/maintenance/pod-cleaner-cronjob.yaml
# CronJob that runs /scripts/pod_cleaner.sh (from the pod-cleaner-script
# ConfigMap) with bash inside a kubectl image, under the pod-cleaner
# service account.
apiVersion: batch/v1
kind: CronJob
metadata:
name: pod-cleaner
namespace: maintenance
spec:
# Cron expression: minute 0 of every hour.
schedule: "0 * * * *"
# While suspend is true, no new Jobs are created from this schedule.
suspend: true
# Forbid: a run is skipped if the previous run has not finished yet.
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 1
failedJobsHistoryLimit: 3
jobTemplate:
spec:
backoffLimit: 1
template:
spec:
serviceAccountName: pod-cleaner
restartPolicy: Never
nodeSelector:
kubernetes.io/arch: arm64
node-role.kubernetes.io/worker: "true"
containers:
- name: cleaner
# Image is pinned by digest rather than by tag.
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
command: ["/usr/bin/env", "bash"]
args: ["/scripts/pod_cleaner.sh"]
volumeMounts:
- name: script
mountPath: /scripts
readOnly: true
volumes:
- name: script
configMap:
name: pod-cleaner-script
# Octal 0555: read and execute for everyone, no write.
defaultMode: 0555

View File

@ -1,12 +0,0 @@
#!/usr/bin/env bash
# Delete every pod, across all namespaces, whose phase is Succeeded or Failed.
set -euo pipefail

for phase in Succeeded Failed; do
  # Emit one "<namespace> <name>" line per pod currently in this phase.
  kubectl get pods -A --field-selector="status.phase=${phase}" \
    -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' \
    | while read -r ns pod; do
        # Ignore blank or incomplete lines.
        if [ -z "${ns}" ] || [ -z "${pod}" ]; then
          continue
        fi
        # Immediate, non-blocking delete; a pod that is already gone is not an error.
        kubectl delete pod -n "${ns}" "${pod}" --ignore-not-found --grace-period=0 --wait=false
      done
done

View File

@ -248,7 +248,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sum((((time() - ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"})) > bool 129600)) or on() vector(0)", "expr": "sum((((time() - ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"})) > bool 129600)) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -316,7 +316,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "count((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"} unless on(task) ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"})) or on() vector(0)", "expr": "count((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"} unless on(task) ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"})) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -376,7 +376,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sum(((1 - ariadne_schedule_last_status{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"}) > bool 0)) or on() vector(0)", "expr": "sum(((1 - ariadne_schedule_last_status{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}) > bool 0)) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -616,7 +616,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"}[$__range])) / 3600)", "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)",
"refId": "A", "refId": "A",
"legendFormat": "{{task}}", "legendFormat": "{{task}}",
"instant": true "instant": true
@ -691,7 +691,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"}[$__range])) / 3600)", "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)",
"refId": "A", "refId": "A",
"legendFormat": "{{task}}", "legendFormat": "{{task}}",
"instant": true "instant": true
@ -766,7 +766,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_seed_room)$\"}[$__range])) / 3600)", "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)",
"refId": "A", "refId": "A",
"legendFormat": "{{task}}", "legendFormat": "{{task}}",
"instant": true "instant": true
@ -841,7 +841,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"} - time()) / 3600))", "expr": "sort_desc(((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"} - time()) / 3600))",
"refId": "A", "refId": "A",
"legendFormat": "{{task}}", "legendFormat": "{{task}}",
"instant": true "instant": true

View File

@ -257,7 +257,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum((((time() - ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"})) > bool 129600)) or on() vector(0)", "expr": "sum((((time() - ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"})) > bool 129600)) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -325,7 +325,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "count((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"} unless on(task) ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"})) or on() vector(0)", "expr": "count((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"} unless on(task) ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"})) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -385,7 +385,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sum(((1 - ariadne_schedule_last_status{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"}) > bool 0)) or on() vector(0)", "expr": "sum(((1 - ariadne_schedule_last_status{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}) > bool 0)) or on() vector(0)",
"refId": "A" "refId": "A"
} }
], ],
@ -625,7 +625,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"}[$__range])) / 3600)", "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)",
"refId": "A", "refId": "A",
"legendFormat": "{{task}}", "legendFormat": "{{task}}",
"instant": true "instant": true
@ -700,7 +700,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"}[$__range])) / 3600)", "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)",
"refId": "A", "refId": "A",
"legendFormat": "{{task}}", "legendFormat": "{{task}}",
"instant": true "instant": true
@ -775,7 +775,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_seed_room)$\"}[$__range])) / 3600)", "expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)",
"refId": "A", "refId": "A",
"legendFormat": "{{task}}", "legendFormat": "{{task}}",
"instant": true "instant": true
@ -850,7 +850,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"} - time()) / 3600))", "expr": "sort_desc(((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"} - time()) / 3600))",
"refId": "A", "refId": "A",
"legendFormat": "{{task}}", "legendFormat": "{{task}}",
"instant": true "instant": true

View File

@ -23,7 +23,6 @@ resources:
- platform-quality-gateway-pvc.yaml - platform-quality-gateway-pvc.yaml
- platform-quality-gateway-service.yaml - platform-quality-gateway-service.yaml
- platform-quality-gateway-deployment.yaml - platform-quality-gateway-deployment.yaml
- platform-quality-suite-probe-cronjob.yaml
- vault-sync-deployment.yaml - vault-sync-deployment.yaml
- grafana-alerting-config.yaml - grafana-alerting-config.yaml
- grafana-folders.yaml - grafana-folders.yaml

View File

@ -1,39 +0,0 @@
# services/monitoring/platform-quality-suite-probe-cronjob.yaml
# CronJob that runs /scripts/platform_quality_suite_probe.sh (from the
# platform-quality-suite-probe-script ConfigMap) in a curl image.
apiVersion: batch/v1
kind: CronJob
metadata:
name: platform-quality-suite-probe
namespace: monitoring
spec:
# Cron expression: every 15 minutes.
schedule: "*/15 * * * *"
# Forbid: a run is skipped if the previous run has not finished yet.
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 2
failedJobsHistoryLimit: 2
jobTemplate:
spec:
# backoffLimit 0 with restartPolicy Never: a failed run is not retried.
backoffLimit: 0
template:
metadata:
labels:
app: platform-quality-suite-probe
spec:
restartPolicy: Never
containers:
- name: probe
image: curlimages/curl:8.12.1
imagePullPolicy: IfNotPresent
command: ["/bin/sh", "/scripts/platform_quality_suite_probe.sh"]
env:
# NOTE(review): presumably where the script pushes its results -- confirm
# in platform_quality_suite_probe.sh.
- name: PUSHGATEWAY_URL
value: http://platform-quality-gateway.monitoring.svc.cluster.local:9091
- name: HTTP_TIMEOUT_SECONDS
value: "12"
volumeMounts:
- name: probe-script
mountPath: /scripts
readOnly: true
volumes:
- name: probe-script
configMap:
name: platform-quality-suite-probe-script
# Octal 0555: read and execute for everyone, no write.
defaultMode: 0555