Compare commits

..

6 Commits

9 changed files with 125 additions and 63 deletions

View File

@@ -423,16 +423,17 @@ ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600" "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
) )
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}' TEST_REPO_SELECTOR = 'repo=~"ariadne|metis"'
ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}' TEST_CI_COVERAGE = f'ariadne_ci_coverage_percent{{{TEST_REPO_SELECTOR}}}'
ARIADNE_TEST_SUCCESS_RATE = ( TEST_CI_TESTS = f'ariadne_ci_tests_total{{{TEST_REPO_SELECTOR}}}'
TEST_SUCCESS_RATE = (
"100 * " "100 * "
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) ' f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result="passed"}}[30d])) '
"/ clamp_min(" "/ clamp_min("
'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)' f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"passed|failed|error"}}[30d])), 1)'
) )
ARIADNE_TEST_FAILURES_24H = ( TEST_FAILURES_24H = (
'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))' f'sum by (result) (max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"failed|error"}}[24h]))'
) )
POSTGRES_CONN_USED = ( POSTGRES_CONN_USED = (
'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") ' 'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
@@ -1297,8 +1298,8 @@ def build_overview():
panels.append( panels.append(
timeseries_panel( timeseries_panel(
42, 42,
"Ariadne Test Success Rate", "Ariadne + Metis Test Success Rate",
ARIADNE_TEST_SUCCESS_RATE, TEST_SUCCESS_RATE,
{"h": 6, "w": 6, "x": 12, "y": 14}, {"h": 6, "w": 6, "x": 12, "y": 14},
unit="percent", unit="percent",
max_value=100, max_value=100,
@@ -1309,8 +1310,8 @@ def build_overview():
panels.append( panels.append(
bargauge_panel( bargauge_panel(
43, 43,
"Tests with Failures (24h)", "Ariadne + Metis Tests with Failures (24h)",
ARIADNE_TEST_FAILURES_24H, TEST_FAILURES_24H,
{"h": 6, "w": 6, "x": 18, "y": 14}, {"h": 6, "w": 6, "x": 18, "y": 14},
unit="none", unit="none",
instant=True, instant=True,
@@ -2656,8 +2657,8 @@ def build_jobs_dashboard():
panels.append( panels.append(
stat_panel( stat_panel(
17, 17,
"Ariadne CI Coverage (%)", "Ariadne + Metis CI Coverage (%)",
ARIADNE_CI_COVERAGE, TEST_CI_COVERAGE,
{"h": 6, "w": 4, "x": 8, "y": 11}, {"h": 6, "w": 4, "x": 8, "y": 11},
unit="percent", unit="percent",
decimals=1, decimals=1,
@@ -2668,8 +2669,8 @@ def build_jobs_dashboard():
panels.append( panels.append(
table_panel( table_panel(
18, 18,
"Ariadne CI Tests (latest)", "Ariadne + Metis CI Tests (latest)",
ARIADNE_CI_TESTS, TEST_CI_TESTS,
{"h": 6, "w": 12, "x": 12, "y": 11}, {"h": 6, "w": 12, "x": 12, "y": 11},
unit="none", unit="none",
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],

View File

@@ -10,6 +10,8 @@ spec:
app: node-image-sweeper app: node-image-sweeper
updateStrategy: updateStrategy:
type: RollingUpdate type: RollingUpdate
rollingUpdate:
maxUnavailable: 100%
template: template:
metadata: metadata:
labels: labels:
@@ -29,6 +31,21 @@ spec:
- name: node-image-sweeper - name: node-image-sweeper
image: python:3.12.9-alpine3.20 image: python:3.12.9-alpine3.20
command: ["/bin/sh", "/scripts/node_image_sweeper.sh"] command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
env:
- name: SWEEP_INTERVAL_SEC
value: "21600"
- name: HIGH_USAGE_PERCENT
value: "70"
- name: EMERGENCY_USAGE_PERCENT
value: "80"
- name: BASE_THRESHOLD_DAYS
value: "14"
- name: HIGH_USAGE_THRESHOLD_DAYS
value: "3"
- name: LOG_RETENTION_DAYS
value: "7"
- name: JOURNAL_MAX_SIZE
value: "200M"
securityContext: securityContext:
privileged: true privileged: true
runAsUser: 0 runAsUser: 0

View File

@@ -2,26 +2,39 @@
set -eu set -eu
ONE_SHOT=${ONE_SHOT:-false} ONE_SHOT=${ONE_SHOT:-false}
THRESHOLD_DAYS=14 SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600}
BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage="" sweep_once() {
if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
THRESHOLD_DAYS=3 threshold_days="${BASE_THRESHOLD_DAYS}"
fi if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then
threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
fi
cutoff=$(python3 - <<'PY' cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY'
import time, os import os
print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400) import time
days = int(os.environ.get("THRESHOLD_DAYS", "14"))
print(int(time.time()) - days * 86400)
PY PY
) )
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ') RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}') IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause" prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
import json
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY' import os
import json, os, sys, time import sys
import time
try: try:
data = json.load(sys.stdin) data = json.load(sys.stdin)
@@ -74,19 +87,33 @@ for p in prune:
PY PY
) )
if [ -n "${prune_list}" ]; then if [ -n "${prune_list}" ]; then
printf "%s" "${prune_list}" | while read -r image_id; do printf "%s" "${prune_list}" | while read -r image_id; do
if [ -n "${image_id}" ]; then if [ -n "${image_id}" ]; then
chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
fi fi
done done
fi fi
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
# Emergency pass for rootfs pressure on SD-backed nodes.
chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
fi
}
sweep_once
if [ "${ONE_SHOT}" = "true" ]; then if [ "${ONE_SHOT}" = "true" ]; then
exit 0 exit 0
fi fi
sleep infinity while true; do
sleep "${SWEEP_INTERVAL_SEC}"
sweep_once
done

View File

@@ -1125,7 +1125,7 @@
{ {
"id": 17, "id": 17,
"type": "stat", "type": "stat",
"title": "Ariadne CI Coverage (%)", "title": "Ariadne + Metis CI Coverage (%)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@@ -1138,7 +1138,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}", "expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
"refId": "A", "refId": "A",
"legendFormat": "{{branch}}", "legendFormat": "{{branch}}",
"instant": true "instant": true
@@ -1188,7 +1188,7 @@
{ {
"id": 18, "id": 18,
"type": "table", "type": "table",
"title": "Ariadne CI Tests (latest)", "title": "Ariadne + Metis CI Tests (latest)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@@ -1201,7 +1201,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}", "expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }

View File

@@ -1677,7 +1677,7 @@
{ {
"id": 42, "id": 42,
"type": "timeseries", "type": "timeseries",
"title": "Ariadne Test Success Rate", "title": "Ariadne + Metis Test Success Rate",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@@ -1690,7 +1690,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)", "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
"refId": "A" "refId": "A"
} }
], ],
@@ -1714,7 +1714,7 @@
{ {
"id": 43, "id": 43,
"type": "bargauge", "type": "bargauge",
"title": "Tests with Failures (24h)", "title": "Ariadne + Metis Tests with Failures (24h)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@@ -1727,7 +1727,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
"refId": "A", "refId": "A",
"legendFormat": "{{result}}", "legendFormat": "{{result}}",
"instant": true "instant": true

View File

@@ -22,7 +22,24 @@ data:
- orgId: 1 - orgId: 1
receiver: email-admins receiver: email-admins
group_by: group_by:
- grafana_folder
- alertname - alertname
group_wait: 1m
group_interval: 30m
repeat_interval: 12h
routes:
- receiver: email-admins
object_matchers:
- [severity, "=", "critical"]
group_wait: 30s
group_interval: 5m
repeat_interval: 2h
- receiver: email-admins
object_matchers:
- [severity, "=", "warning"]
group_wait: 5m
group_interval: 2h
repeat_interval: 24h
rules.yaml: | rules.yaml: |
apiVersion: 1 apiVersion: 1
groups: groups:
@@ -32,7 +49,7 @@ data:
interval: 1m interval: 1m
rules: rules:
- uid: disk-pressure-root - uid: disk-pressure-root
title: "Node rootfs high (>80%)" title: "Node rootfs high (>85%)"
condition: C condition: C
for: "10m" for: "10m"
data: data:
@@ -66,7 +83,7 @@ data:
type: threshold type: threshold
conditions: conditions:
- evaluator: - evaluator:
params: [80] params: [85]
type: gt type: gt
operator: operator:
type: and type: and
@@ -76,7 +93,7 @@ data:
noDataState: NoData noDataState: NoData
execErrState: Error execErrState: Error
annotations: annotations:
summary: "{{ $labels.node }} rootfs >80% for 10m" summary: "{{ $labels.node }} rootfs >85% for 10m"
labels: labels:
severity: warning severity: warning
- uid: disk-growth-1h - uid: disk-growth-1h
@@ -501,7 +518,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: postmark_outbound_bounce_rate{window="1d"} expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0)
legendFormat: bounce 1d legendFormat: bounce 1d
datasource: datasource:
type: prometheus type: prometheus
@@ -530,7 +547,7 @@ data:
reducer: reducer:
type: last type: last
type: query type: query
noDataState: NoData noDataState: OK
execErrState: Error execErrState: Error
annotations: annotations:
summary: "Postmark 1d bounce rate >5%" summary: "Postmark 1d bounce rate >5%"
@@ -549,7 +566,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: min_over_time(max by (instance) (postmark_api_up)[5m]) expr: max(postmark_api_up) or on() vector(0)
legendFormat: api up legendFormat: api up
datasource: datasource:
type: prometheus type: prometheus
@@ -578,7 +595,7 @@ data:
reducer: reducer:
type: last type: last
type: query type: query
noDataState: NoData noDataState: OK
execErrState: Error execErrState: Error
annotations: annotations:
summary: "Postmark exporter reports API down" summary: "Postmark exporter reports API down"

View File

@@ -1134,7 +1134,7 @@ data:
{ {
"id": 17, "id": 17,
"type": "stat", "type": "stat",
"title": "Ariadne CI Coverage (%)", "title": "Ariadne + Metis CI Coverage (%)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@@ -1147,7 +1147,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}", "expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}",
"refId": "A", "refId": "A",
"legendFormat": "{{branch}}", "legendFormat": "{{branch}}",
"instant": true "instant": true
@@ -1197,7 +1197,7 @@ data:
{ {
"id": 18, "id": 18,
"type": "table", "type": "table",
"title": "Ariadne CI Tests (latest)", "title": "Ariadne + Metis CI Tests (latest)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@@ -1210,7 +1210,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_tests_total{repo=\"ariadne\"}", "expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }

View File

@@ -1686,7 +1686,7 @@ data:
{ {
"id": 42, "id": 42,
"type": "timeseries", "type": "timeseries",
"title": "Ariadne Test Success Rate", "title": "Ariadne + Metis Test Success Rate",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@@ -1699,7 +1699,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)", "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)",
"refId": "A" "refId": "A"
} }
], ],
@@ -1723,7 +1723,7 @@ data:
{ {
"id": 43, "id": 43,
"type": "bargauge", "type": "bargauge",
"title": "Tests with Failures (24h)", "title": "Ariadne + Metis Tests with Failures (24h)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@@ -1736,7 +1736,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))", "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))",
"refId": "A", "refId": "A",
"legendFormat": "{{result}}", "legendFormat": "{{result}}",
"instant": true "instant": true

View File

@@ -286,7 +286,7 @@ spec:
podAnnotations: podAnnotations:
vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "monitoring" vault.hashicorp.com/role: "monitoring"
monitoring.bstein.dev/restart-rev: "4" monitoring.bstein.dev/restart-rev: "6"
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
vault.hashicorp.com/agent-inject-template-grafana-env.sh: | vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }} {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}