ariadne: remove remaining cronjobs and migrate schedule ownership
This commit is contained in:
parent
166020ca1d
commit
64b4f14018
@ -36,3 +36,17 @@ ariadne_schedule_tasks:
|
||||
- task: schedule.comms_seed_room
|
||||
check_last_success: true
|
||||
max_success_age_hours: 48
|
||||
- task: schedule.pod_cleaner
|
||||
check_last_success: true
|
||||
max_success_age_hours: 6
|
||||
- task: schedule.opensearch_prune
|
||||
check_last_success: false
|
||||
- task: schedule.image_sweeper
|
||||
check_last_success: true
|
||||
max_success_age_hours: 18
|
||||
- task: schedule.metis_k3s_token_sync
|
||||
check_last_success: true
|
||||
max_success_age_hours: 12
|
||||
- task: schedule.platform_quality_suite_probe
|
||||
check_last_success: true
|
||||
max_success_age_hours: 2
|
||||
|
||||
@ -400,6 +400,11 @@ ARIADNE_ALL_SCHEDULE_TASKS = [
|
||||
"schedule.comms_pin_invite",
|
||||
"schedule.comms_reset_room",
|
||||
"schedule.comms_seed_room",
|
||||
"schedule.pod_cleaner",
|
||||
"schedule.opensearch_prune",
|
||||
"schedule.image_sweeper",
|
||||
"schedule.metis_k3s_token_sync",
|
||||
"schedule.platform_quality_suite_probe",
|
||||
]
|
||||
ARIADNE_FAST_SCHEDULE_TASKS = [
|
||||
task
|
||||
@ -414,6 +419,10 @@ ARIADNE_SCHEDULE_HEALTH_TASKS = [
|
||||
"schedule.firefly_user_sync",
|
||||
"schedule.comms_guest_name",
|
||||
"schedule.comms_seed_room",
|
||||
"schedule.pod_cleaner",
|
||||
"schedule.image_sweeper",
|
||||
"schedule.metis_k3s_token_sync",
|
||||
"schedule.platform_quality_suite_probe",
|
||||
]
|
||||
ARIADNE_ALL_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_ALL_SCHEDULE_TASKS)})$"'
|
||||
ARIADNE_FAST_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_FAST_SCHEDULE_TASKS)})$"'
|
||||
|
||||
@ -18,7 +18,6 @@ resources:
|
||||
- oneoffs/opensearch-ism-job.yaml
|
||||
- oneoffs/opensearch-dashboards-setup-job.yaml
|
||||
- oneoffs/opensearch-observability-setup-job.yaml
|
||||
- opensearch-prune-cronjob.yaml
|
||||
- fluent-bit-helmrelease.yaml
|
||||
- node-log-rotation-daemonset.yaml
|
||||
- node-image-gc-rpi4-daemonset.yaml
|
||||
@ -46,12 +45,6 @@ configMapGenerator:
|
||||
- node_image_prune_rpi5.sh=scripts/node_image_prune_rpi5.sh
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: opensearch-prune-script
|
||||
namespace: logging
|
||||
files:
|
||||
- prune.py=scripts/opensearch_prune.py
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: opensearch-observability-script
|
||||
namespace: logging
|
||||
files:
|
||||
|
||||
@ -1,48 +0,0 @@
|
||||
# services/logging/opensearch-prune-cronjob.yaml
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: opensearch-prune
|
||||
namespace: logging
|
||||
spec:
|
||||
schedule: "23 3 * * *"
|
||||
suspend: true
|
||||
concurrencyPolicy: Forbid
|
||||
successfulJobsHistoryLimit: 1
|
||||
failedJobsHistoryLimit: 3
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 2
|
||||
template:
|
||||
spec:
|
||||
restartPolicy: OnFailure
|
||||
nodeSelector:
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
hardware: rpi5
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: hardware
|
||||
operator: In
|
||||
values:
|
||||
- rpi5
|
||||
containers:
|
||||
- name: prune
|
||||
image: python:3.11-alpine
|
||||
command: ["python", "/scripts/prune.py"]
|
||||
env:
|
||||
- name: OPENSEARCH_URL
|
||||
value: http://opensearch-master.logging.svc.cluster.local:9200
|
||||
- name: LOG_LIMIT_BYTES
|
||||
value: "1099511627776"
|
||||
- name: LOG_INDEX_PATTERNS
|
||||
value: "kube-*,journald-*,trace-analytics-*"
|
||||
volumeMounts:
|
||||
- name: scripts
|
||||
mountPath: /scripts
|
||||
volumes:
|
||||
- name: scripts
|
||||
configMap:
|
||||
name: opensearch-prune-script
|
||||
@ -1,77 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
|
||||
os_url = os.environ.get("OPENSEARCH_URL", "http://opensearch-master.logging.svc.cluster.local:9200").rstrip("/")
|
||||
limit_bytes = int(os.environ.get("LOG_LIMIT_BYTES", str(1024**4)))
|
||||
patterns = [p.strip() for p in os.environ.get("LOG_INDEX_PATTERNS", "kube-*,journald-*").split(",") if p.strip()]
|
||||
|
||||
UNITS = {
|
||||
"b": 1,
|
||||
"kb": 1024,
|
||||
"mb": 1024**2,
|
||||
"gb": 1024**3,
|
||||
"tb": 1024**4,
|
||||
}
|
||||
|
||||
def parse_size(value: str) -> int:
|
||||
if not value:
|
||||
return 0
|
||||
text = value.strip().lower()
|
||||
if text in ("-", "0"):
|
||||
return 0
|
||||
match = re.match(r"^([0-9.]+)([a-z]+)$", text)
|
||||
if not match:
|
||||
return 0
|
||||
number = float(match.group(1))
|
||||
unit = match.group(2)
|
||||
if unit not in UNITS:
|
||||
return 0
|
||||
return int(number * UNITS[unit])
|
||||
|
||||
def request_json(path: str):
|
||||
url = f"{os_url}{path}"
|
||||
with urllib.request.urlopen(url, timeout=30) as response:
|
||||
payload = response.read().decode("utf-8")
|
||||
return json.loads(payload)
|
||||
|
||||
def delete_index(index: str) -> None:
|
||||
url = f"{os_url}/{index}"
|
||||
req = urllib.request.Request(url, method="DELETE")
|
||||
with urllib.request.urlopen(req, timeout=30) as response:
|
||||
_ = response.read()
|
||||
print(f"deleted {index}")
|
||||
|
||||
indices = []
|
||||
for pattern in patterns:
|
||||
try:
|
||||
data = request_json(f"/_cat/indices/{pattern}?format=json&h=index,store.size,creation.date")
|
||||
except urllib.error.HTTPError as exc:
|
||||
if exc.code == 404:
|
||||
continue
|
||||
raise
|
||||
for item in data:
|
||||
index = item.get("index")
|
||||
if not index or index.startswith("."):
|
||||
continue
|
||||
size = parse_size(item.get("store.size", ""))
|
||||
created = int(item.get("creation.date", "0") or 0)
|
||||
indices.append({"index": index, "size": size, "created": created})
|
||||
|
||||
total = sum(item["size"] for item in indices)
|
||||
print(f"total_log_bytes={total}")
|
||||
if total <= limit_bytes:
|
||||
print("within limit")
|
||||
sys.exit(0)
|
||||
|
||||
indices.sort(key=lambda item: item["created"])
|
||||
for item in indices:
|
||||
if total <= limit_bytes:
|
||||
break
|
||||
delete_index(item["index"])
|
||||
total -= item["size"]
|
||||
|
||||
print(f"remaining_log_bytes={total}")
|
||||
@ -345,6 +345,10 @@ spec:
|
||||
value: "15"
|
||||
- name: ARIADNE_SCHEDULE_METIS_SENTINEL_WATCH
|
||||
value: "*/30 * * * *"
|
||||
- name: ARIADNE_SCHEDULE_METIS_K3S_TOKEN_SYNC
|
||||
value: "11 */6 * * *"
|
||||
- name: ARIADNE_SCHEDULE_PLATFORM_QUALITY_SUITE_PROBE
|
||||
value: "*/15 * * * *"
|
||||
- name: METRICS_PATH
|
||||
value: "/metrics"
|
||||
resources:
|
||||
|
||||
@ -1,53 +0,0 @@
|
||||
# services/maintenance/image-sweeper-cronjob.yaml
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: image-sweeper
|
||||
namespace: maintenance
|
||||
spec:
|
||||
schedule: "30 4 * * 0"
|
||||
suspend: true
|
||||
concurrencyPolicy: Forbid
|
||||
successfulJobsHistoryLimit: 2
|
||||
failedJobsHistoryLimit: 2
|
||||
jobTemplate:
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
serviceAccountName: node-image-sweeper
|
||||
restartPolicy: OnFailure
|
||||
nodeSelector:
|
||||
kubernetes.io/os: linux
|
||||
kubernetes.io/arch: arm64
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
tolerations:
|
||||
- key: node-role.kubernetes.io/control-plane
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
- key: node-role.kubernetes.io/master
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
containers:
|
||||
- name: image-sweeper
|
||||
image: python:3.12.9-alpine3.20
|
||||
command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
|
||||
env:
|
||||
- name: ONE_SHOT
|
||||
value: "true"
|
||||
securityContext:
|
||||
privileged: true
|
||||
runAsUser: 0
|
||||
volumeMounts:
|
||||
- name: host-root
|
||||
mountPath: /host
|
||||
- name: script
|
||||
mountPath: /scripts
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: host-root
|
||||
hostPath:
|
||||
path: /
|
||||
- name: script
|
||||
configMap:
|
||||
name: node-image-sweeper-script
|
||||
defaultMode: 0555
|
||||
@ -7,10 +7,13 @@ resources:
|
||||
- secretproviderclass.yaml
|
||||
- metis-configmap.yaml
|
||||
- metis-data-pvc.yaml
|
||||
- soteria-configmap.yaml
|
||||
- vault-serviceaccount.yaml
|
||||
- vault-sync-deployment.yaml
|
||||
- ariadne-serviceaccount.yaml
|
||||
- soteria-serviceaccount.yaml
|
||||
- ariadne-rbac.yaml
|
||||
- soteria-rbac.yaml
|
||||
- disable-k3s-traefik-serviceaccount.yaml
|
||||
- disable-k3s-traefik-rbac.yaml
|
||||
- k3s-traefik-cleanup-rbac.yaml
|
||||
@ -21,19 +24,18 @@ resources:
|
||||
- pod-cleaner-rbac.yaml
|
||||
- ariadne-deployment.yaml
|
||||
- metis-deployment.yaml
|
||||
- soteria-deployment.yaml
|
||||
- oneoffs/ariadne-migrate-job.yaml
|
||||
- ariadne-service.yaml
|
||||
- soteria-service.yaml
|
||||
- disable-k3s-traefik-daemonset.yaml
|
||||
- oneoffs/k3s-traefik-cleanup-job.yaml
|
||||
- node-nofile-daemonset.yaml
|
||||
- metis-sentinel-amd64-daemonset.yaml
|
||||
- metis-sentinel-arm64-daemonset.yaml
|
||||
- metis-k3s-token-sync-cronjob.yaml
|
||||
- k3s-agent-restart-daemonset.yaml
|
||||
- pod-cleaner-cronjob.yaml
|
||||
- node-image-sweeper-serviceaccount.yaml
|
||||
- node-image-sweeper-daemonset.yaml
|
||||
- image-sweeper-cronjob.yaml
|
||||
- metis-service.yaml
|
||||
- oauth2-proxy-metis.yaml
|
||||
- metis-certificate.yaml
|
||||
@ -43,6 +45,8 @@ images:
|
||||
newTag: 0.1.0-22 # {"$imagepolicy": "maintenance:ariadne:tag"}
|
||||
- name: registry.bstein.dev/bstein/metis
|
||||
newTag: 0.1.0-9-amd64
|
||||
- name: registry.bstein.dev/bstein/soteria
|
||||
newTag: 0.1.0-11 # {"$imagepolicy": "maintenance:soteria:tag"}
|
||||
configMapGenerator:
|
||||
- name: disable-k3s-traefik-script
|
||||
namespace: maintenance
|
||||
@ -62,12 +66,6 @@ configMapGenerator:
|
||||
- node_nofile.sh=scripts/node_nofile.sh
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: pod-cleaner-script
|
||||
namespace: maintenance
|
||||
files:
|
||||
- pod_cleaner.sh=scripts/pod_cleaner.sh
|
||||
options:
|
||||
disableNameSuffixHash: true
|
||||
- name: node-image-sweeper-script
|
||||
namespace: maintenance
|
||||
files:
|
||||
|
||||
@ -1,55 +0,0 @@
|
||||
# services/maintenance/metis-k3s-token-sync-cronjob.yaml
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: metis-k3s-token-sync
|
||||
namespace: maintenance
|
||||
spec:
|
||||
schedule: "11 */6 * * *"
|
||||
concurrencyPolicy: Forbid
|
||||
successfulJobsHistoryLimit: 1
|
||||
failedJobsHistoryLimit: 2
|
||||
jobTemplate:
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
serviceAccountName: metis-token-sync
|
||||
restartPolicy: OnFailure
|
||||
nodeName: titan-0a
|
||||
tolerations:
|
||||
- key: node-role.kubernetes.io/control-plane
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
- key: node-role.kubernetes.io/master
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
containers:
|
||||
- name: sync
|
||||
image: hashicorp/vault:1.17.6
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
args:
|
||||
- |
|
||||
set -eu
|
||||
token="$(tr -d '\n' < /host/var/lib/rancher/k3s/server/token)"
|
||||
jwt="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
|
||||
VAULT_TOKEN="$(vault write -field=token auth/kubernetes/login role="${VAULT_K8S_ROLE}" jwt="${jwt}")"
|
||||
export VAULT_TOKEN
|
||||
vault kv put kv/atlas/maintenance/metis-runtime k3s_token="${token}"
|
||||
env:
|
||||
- name: VAULT_ADDR
|
||||
value: http://vault.vault.svc.cluster.local:8200
|
||||
- name: VAULT_K8S_ROLE
|
||||
value: maintenance-metis-token-sync
|
||||
securityContext:
|
||||
runAsUser: 0
|
||||
volumeMounts:
|
||||
- name: k3s-server
|
||||
mountPath: /host/var/lib/rancher/k3s/server
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: k3s-server
|
||||
hostPath:
|
||||
path: /var/lib/rancher/k3s/server
|
||||
@ -1,36 +0,0 @@
|
||||
# services/maintenance/pod-cleaner-cronjob.yaml
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: pod-cleaner
|
||||
namespace: maintenance
|
||||
spec:
|
||||
schedule: "0 * * * *"
|
||||
suspend: true
|
||||
concurrencyPolicy: Forbid
|
||||
successfulJobsHistoryLimit: 1
|
||||
failedJobsHistoryLimit: 3
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 1
|
||||
template:
|
||||
spec:
|
||||
serviceAccountName: pod-cleaner
|
||||
restartPolicy: Never
|
||||
nodeSelector:
|
||||
kubernetes.io/arch: arm64
|
||||
node-role.kubernetes.io/worker: "true"
|
||||
containers:
|
||||
- name: cleaner
|
||||
image: bitnami/kubectl@sha256:554ab88b1858e8424c55de37ad417b16f2a0e65d1607aa0f3fe3ce9b9f10b131
|
||||
command: ["/usr/bin/env", "bash"]
|
||||
args: ["/scripts/pod_cleaner.sh"]
|
||||
volumeMounts:
|
||||
- name: script
|
||||
mountPath: /scripts
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: script
|
||||
configMap:
|
||||
name: pod-cleaner-script
|
||||
defaultMode: 0555
|
||||
@ -1,12 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
for phase in Succeeded Failed; do
|
||||
kubectl get pods -A --field-selector="status.phase=${phase}" \
|
||||
-o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' \
|
||||
| while read -r namespace name; do
|
||||
if [ -n "${namespace}" ] && [ -n "${name}" ]; then
|
||||
kubectl delete pod -n "${namespace}" "${name}" --ignore-not-found --grace-period=0 --wait=false
|
||||
fi
|
||||
done
|
||||
done
|
||||
@ -248,7 +248,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum((((time() - ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"})) > bool 129600)) or on() vector(0)",
|
||||
"expr": "sum((((time() - ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"})) > bool 129600)) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -316,7 +316,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"} unless on(task) ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"})) or on() vector(0)",
|
||||
"expr": "count((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"} unless on(task) ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"})) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -376,7 +376,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(((1 - ariadne_schedule_last_status{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"}) > bool 0)) or on() vector(0)",
|
||||
"expr": "sum(((1 - ariadne_schedule_last_status{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}) > bool 0)) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -616,7 +616,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"}[$__range])) / 3600)",
|
||||
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{task}}",
|
||||
"instant": true
|
||||
@ -691,7 +691,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"}[$__range])) / 3600)",
|
||||
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{task}}",
|
||||
"instant": true
|
||||
@ -766,7 +766,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_seed_room)$\"}[$__range])) / 3600)",
|
||||
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{task}}",
|
||||
"instant": true
|
||||
@ -841,7 +841,7 @@
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sort_desc(((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"} - time()) / 3600))",
|
||||
"expr": "sort_desc(((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"} - time()) / 3600))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{task}}",
|
||||
"instant": true
|
||||
|
||||
@ -257,7 +257,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum((((time() - ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"})) > bool 129600)) or on() vector(0)",
|
||||
"expr": "sum((((time() - ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"})) > bool 129600)) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -325,7 +325,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"} unless on(task) ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"})) or on() vector(0)",
|
||||
"expr": "count((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"} unless on(task) ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"})) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -385,7 +385,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(((1 - ariadne_schedule_last_status{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room)$\"}) > bool 0)) or on() vector(0)",
|
||||
"expr": "sum(((1 - ariadne_schedule_last_status{task=~\"^(schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.firefly_user_sync|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}) > bool 0)) or on() vector(0)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -625,7 +625,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"}[$__range])) / 3600)",
|
||||
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{task}}",
|
||||
"instant": true
|
||||
@ -700,7 +700,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"}[$__range])) / 3600)",
|
||||
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{task}}",
|
||||
"instant": true
|
||||
@ -775,7 +775,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_seed_room)$\"}[$__range])) / 3600)",
|
||||
"expr": "sort_desc((time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"}[$__range])) / 3600)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{task}}",
|
||||
"instant": true
|
||||
@ -850,7 +850,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sort_desc(((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room)$\"} - time()) / 3600))",
|
||||
"expr": "sort_desc(((ariadne_schedule_next_run_timestamp_seconds{task=~\"^(schedule.mailu_sync|schedule.nextcloud_sync|schedule.nextcloud_cron|schedule.nextcloud_maintenance|schedule.vaultwarden_sync|schedule.wger_user_sync|schedule.wger_admin|schedule.firefly_user_sync|schedule.firefly_cron|schedule.vault_k8s_auth|schedule.vault_oidc|schedule.comms_guest_name|schedule.comms_pin_invite|schedule.comms_reset_room|schedule.comms_seed_room|schedule.pod_cleaner|schedule.opensearch_prune|schedule.image_sweeper|schedule.metis_k3s_token_sync|schedule.platform_quality_suite_probe)$\"} - time()) / 3600))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{task}}",
|
||||
"instant": true
|
||||
|
||||
@ -23,7 +23,6 @@ resources:
|
||||
- platform-quality-gateway-pvc.yaml
|
||||
- platform-quality-gateway-service.yaml
|
||||
- platform-quality-gateway-deployment.yaml
|
||||
- platform-quality-suite-probe-cronjob.yaml
|
||||
- vault-sync-deployment.yaml
|
||||
- grafana-alerting-config.yaml
|
||||
- grafana-folders.yaml
|
||||
|
||||
@ -1,39 +0,0 @@
|
||||
# services/monitoring/platform-quality-suite-probe-cronjob.yaml
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: platform-quality-suite-probe
|
||||
namespace: monitoring
|
||||
spec:
|
||||
schedule: "*/15 * * * *"
|
||||
concurrencyPolicy: Forbid
|
||||
successfulJobsHistoryLimit: 2
|
||||
failedJobsHistoryLimit: 2
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 0
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: platform-quality-suite-probe
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
containers:
|
||||
- name: probe
|
||||
image: curlimages/curl:8.12.1
|
||||
imagePullPolicy: IfNotPresent
|
||||
command: ["/bin/sh", "/scripts/platform_quality_suite_probe.sh"]
|
||||
env:
|
||||
- name: PUSHGATEWAY_URL
|
||||
value: http://platform-quality-gateway.monitoring.svc.cluster.local:9091
|
||||
- name: HTTP_TIMEOUT_SECONDS
|
||||
value: "12"
|
||||
volumeMounts:
|
||||
- name: probe-script
|
||||
mountPath: /scripts
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: probe-script
|
||||
configMap:
|
||||
name: platform-quality-suite-probe-script
|
||||
defaultMode: 0555
|
||||
Loading…
x
Reference in New Issue
Block a user