Compare commits

..

No commits in common. "6c3c1342cdc09e1ef7c9e3609022372e7e22972f" and "e93aa6e33b7544dd71eec1e7ad92ca1ce0f36161" have entirely different histories.

9 changed files with 63 additions and 125 deletions

View File

@ -423,17 +423,16 @@ ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
"(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600" "(time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds[$__range])) / 3600"
) )
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
TEST_REPO_SELECTOR = 'repo=~"ariadne|metis"' ARIADNE_CI_COVERAGE = 'ariadne_ci_coverage_percent{repo="ariadne"}'
TEST_CI_COVERAGE = f'ariadne_ci_coverage_percent{{{TEST_REPO_SELECTOR}}}' ARIADNE_CI_TESTS = 'ariadne_ci_tests_total{repo="ariadne"}'
TEST_CI_TESTS = f'ariadne_ci_tests_total{{{TEST_REPO_SELECTOR}}}' ARIADNE_TEST_SUCCESS_RATE = (
TEST_SUCCESS_RATE = (
"100 * " "100 * "
f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result="passed"}}[30d])) ' 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result="passed"}[30d])) '
"/ clamp_min(" "/ clamp_min("
f'sum(max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"passed|failed|error"}}[30d])), 1)' 'sum(max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"passed|failed|error"}[30d])), 1)'
) )
TEST_FAILURES_24H = ( ARIADNE_TEST_FAILURES_24H = (
f'sum by (result) (max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"failed|error"}}[24h]))' 'sum by (result) (max_over_time(ariadne_ci_tests_total{repo="ariadne",result=~"failed|error"}[24h]))'
) )
POSTGRES_CONN_USED = ( POSTGRES_CONN_USED = (
'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") ' 'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
@ -1298,8 +1297,8 @@ def build_overview():
panels.append( panels.append(
timeseries_panel( timeseries_panel(
42, 42,
"Ariadne + Metis Test Success Rate", "Ariadne Test Success Rate",
TEST_SUCCESS_RATE, ARIADNE_TEST_SUCCESS_RATE,
{"h": 6, "w": 6, "x": 12, "y": 14}, {"h": 6, "w": 6, "x": 12, "y": 14},
unit="percent", unit="percent",
max_value=100, max_value=100,
@ -1310,8 +1309,8 @@ def build_overview():
panels.append( panels.append(
bargauge_panel( bargauge_panel(
43, 43,
"Ariadne + Metis Tests with Failures (24h)", "Tests with Failures (24h)",
TEST_FAILURES_24H, ARIADNE_TEST_FAILURES_24H,
{"h": 6, "w": 6, "x": 18, "y": 14}, {"h": 6, "w": 6, "x": 18, "y": 14},
unit="none", unit="none",
instant=True, instant=True,
@ -2657,8 +2656,8 @@ def build_jobs_dashboard():
panels.append( panels.append(
stat_panel( stat_panel(
17, 17,
"Ariadne + Metis CI Coverage (%)", "Ariadne CI Coverage (%)",
TEST_CI_COVERAGE, ARIADNE_CI_COVERAGE,
{"h": 6, "w": 4, "x": 8, "y": 11}, {"h": 6, "w": 4, "x": 8, "y": 11},
unit="percent", unit="percent",
decimals=1, decimals=1,
@ -2669,8 +2668,8 @@ def build_jobs_dashboard():
panels.append( panels.append(
table_panel( table_panel(
18, 18,
"Ariadne + Metis CI Tests (latest)", "Ariadne CI Tests (latest)",
TEST_CI_TESTS, ARIADNE_CI_TESTS,
{"h": 6, "w": 12, "x": 12, "y": 11}, {"h": 6, "w": 12, "x": 12, "y": 11},
unit="none", unit="none",
transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}], transformations=[{"id": "labelsToFields", "options": {}}, {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}}],

View File

@ -10,8 +10,6 @@ spec:
app: node-image-sweeper app: node-image-sweeper
updateStrategy: updateStrategy:
type: RollingUpdate type: RollingUpdate
rollingUpdate:
maxUnavailable: 100%
template: template:
metadata: metadata:
labels: labels:
@ -31,21 +29,6 @@ spec:
- name: node-image-sweeper - name: node-image-sweeper
image: python:3.12.9-alpine3.20 image: python:3.12.9-alpine3.20
command: ["/bin/sh", "/scripts/node_image_sweeper.sh"] command: ["/bin/sh", "/scripts/node_image_sweeper.sh"]
env:
- name: SWEEP_INTERVAL_SEC
value: "21600"
- name: HIGH_USAGE_PERCENT
value: "70"
- name: EMERGENCY_USAGE_PERCENT
value: "80"
- name: BASE_THRESHOLD_DAYS
value: "14"
- name: HIGH_USAGE_THRESHOLD_DAYS
value: "3"
- name: LOG_RETENTION_DAYS
value: "7"
- name: JOURNAL_MAX_SIZE
value: "200M"
securityContext: securityContext:
privileged: true privileged: true
runAsUser: 0 runAsUser: 0

View File

@ -2,39 +2,26 @@
set -eu set -eu
ONE_SHOT=${ONE_SHOT:-false} ONE_SHOT=${ONE_SHOT:-false}
SWEEP_INTERVAL_SEC=${SWEEP_INTERVAL_SEC:-21600} THRESHOLD_DAYS=14
BASE_THRESHOLD_DAYS=${BASE_THRESHOLD_DAYS:-14}
HIGH_USAGE_THRESHOLD_DAYS=${HIGH_USAGE_THRESHOLD_DAYS:-3}
HIGH_USAGE_PERCENT=${HIGH_USAGE_PERCENT:-70}
EMERGENCY_USAGE_PERCENT=${EMERGENCY_USAGE_PERCENT:-85}
LOG_RETENTION_DAYS=${LOG_RETENTION_DAYS:-7}
JOURNAL_MAX_SIZE=${JOURNAL_MAX_SIZE:-200M}
SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
sweep_once() { usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage=""
usage=$(df -P /host | awk 'NR==2 {gsub(/%/,"",$5); print $5}') || usage="" if [ -n "${usage}" ] && [ "${usage}" -ge 70 ]; then
threshold_days="${BASE_THRESHOLD_DAYS}" THRESHOLD_DAYS=3
if [ -n "${usage}" ] && [ "${usage}" -ge "${HIGH_USAGE_PERCENT}" ]; then fi
threshold_days="${HIGH_USAGE_THRESHOLD_DAYS}"
fi
cutoff=$(THRESHOLD_DAYS="${threshold_days}" python3 - <<'PY' cutoff=$(python3 - <<'PY'
import os import time, os
import time print(int(time.time()) - int(os.environ.get("THRESHOLD_DAYS", "14")) * 86400)
days = int(os.environ.get("THRESHOLD_DAYS", "14"))
print(int(time.time()) - days * 86400)
PY PY
) )
RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ') RUNNING=$(chroot /host /bin/sh -c "crictl ps -a --quiet 2>/dev/null" | tr -s ' ' '\n' | sort -u | tr '\n' ' ')
IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}') IMAGES_JSON=$(chroot /host /bin/sh -c "crictl images -o json 2>/dev/null" || echo '{}')
prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY' SKIP="registry.k8s.io/pause k8s.gcr.io/pause rancher/mirrored-pause"
import json
import os prune_list=$(printf "%s" "${IMAGES_JSON}" | CUTOFF="${cutoff}" RUNNING="${RUNNING}" SKIP="${SKIP}" python3 - <<'PY'
import sys import json, os, sys, time
import time
try: try:
data = json.load(sys.stdin) data = json.load(sys.stdin)
@ -87,33 +74,19 @@ for p in prune:
PY PY
) )
if [ -n "${prune_list}" ]; then if [ -n "${prune_list}" ]; then
printf "%s" "${prune_list}" | while read -r image_id; do printf "%s" "${prune_list}" | while read -r image_id; do
if [ -n "${image_id}" ]; then if [ -n "${image_id}" ]; then
chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true chroot /host /bin/sh -c "crictl rmi --prune ${image_id}" || true
fi fi
done done
fi fi
find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true find /host/var/lib/rancher/k3s/agent/images -type f -name "*.tar" -mtime +7 -print -delete 2>/dev/null || true
find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true find /host/var/lib/rancher/k3s/agent/containerd -maxdepth 1 -type f -mtime +7 -print -delete 2>/dev/null || true
if [ -n "${usage}" ] && [ "${usage}" -ge "${EMERGENCY_USAGE_PERCENT}" ]; then
# Emergency pass for rootfs pressure on SD-backed nodes.
chroot /host /bin/sh -c "journalctl --vacuum-size='${JOURNAL_MAX_SIZE}' >/dev/null 2>&1 || true"
find /host/var/log -type f -name "*.gz" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
find /host/var/log/pods -type f -name "*.log" -mtime +"${LOG_RETENTION_DAYS}" -print -delete 2>/dev/null || true
chroot /host /bin/sh -c "if command -v apt-get >/dev/null 2>&1; then apt-get clean >/dev/null 2>&1 || true; fi"
fi
}
sweep_once
if [ "${ONE_SHOT}" = "true" ]; then if [ "${ONE_SHOT}" = "true" ]; then
exit 0 exit 0
fi fi
while true; do sleep infinity
sleep "${SWEEP_INTERVAL_SEC}"
sweep_once
done

View File

@ -1125,7 +1125,7 @@
{ {
"id": 17, "id": 17,
"type": "stat", "type": "stat",
"title": "Ariadne + Metis CI Coverage (%)", "title": "Ariadne CI Coverage (%)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1138,7 +1138,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}", "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
"refId": "A", "refId": "A",
"legendFormat": "{{branch}}", "legendFormat": "{{branch}}",
"instant": true "instant": true
@ -1188,7 +1188,7 @@
{ {
"id": 18, "id": 18,
"type": "table", "type": "table",
"title": "Ariadne + Metis CI Tests (latest)", "title": "Ariadne CI Tests (latest)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1201,7 +1201,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}", "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }

View File

@ -1677,7 +1677,7 @@
{ {
"id": 42, "id": 42,
"type": "timeseries", "type": "timeseries",
"title": "Ariadne + Metis Test Success Rate", "title": "Ariadne Test Success Rate",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1690,7 +1690,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)", "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
"refId": "A" "refId": "A"
} }
], ],
@ -1714,7 +1714,7 @@
{ {
"id": 43, "id": 43,
"type": "bargauge", "type": "bargauge",
"title": "Ariadne + Metis Tests with Failures (24h)", "title": "Tests with Failures (24h)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1727,7 +1727,7 @@
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))", "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
"refId": "A", "refId": "A",
"legendFormat": "{{result}}", "legendFormat": "{{result}}",
"instant": true "instant": true

View File

@ -22,24 +22,7 @@ data:
- orgId: 1 - orgId: 1
receiver: email-admins receiver: email-admins
group_by: group_by:
- grafana_folder
- alertname - alertname
group_wait: 1m
group_interval: 30m
repeat_interval: 12h
routes:
- receiver: email-admins
object_matchers:
- [severity, "=", "critical"]
group_wait: 30s
group_interval: 5m
repeat_interval: 2h
- receiver: email-admins
object_matchers:
- [severity, "=", "warning"]
group_wait: 5m
group_interval: 2h
repeat_interval: 24h
rules.yaml: | rules.yaml: |
apiVersion: 1 apiVersion: 1
groups: groups:
@ -49,7 +32,7 @@ data:
interval: 1m interval: 1m
rules: rules:
- uid: disk-pressure-root - uid: disk-pressure-root
title: "Node rootfs high (>85%)" title: "Node rootfs high (>80%)"
condition: C condition: C
for: "10m" for: "10m"
data: data:
@ -83,7 +66,7 @@ data:
type: threshold type: threshold
conditions: conditions:
- evaluator: - evaluator:
params: [85] params: [80]
type: gt type: gt
operator: operator:
type: and type: and
@ -93,7 +76,7 @@ data:
noDataState: NoData noDataState: NoData
execErrState: Error execErrState: Error
annotations: annotations:
summary: "{{ $labels.node }} rootfs >85% for 10m" summary: "{{ $labels.node }} rootfs >80% for 10m"
labels: labels:
severity: warning severity: warning
- uid: disk-growth-1h - uid: disk-growth-1h
@ -518,7 +501,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: max(postmark_outbound_bounce_rate{window="1d"}) or on() vector(0) expr: postmark_outbound_bounce_rate{window="1d"}
legendFormat: bounce 1d legendFormat: bounce 1d
datasource: datasource:
type: prometheus type: prometheus
@ -547,7 +530,7 @@ data:
reducer: reducer:
type: last type: last
type: query type: query
noDataState: OK noDataState: NoData
execErrState: Error execErrState: Error
annotations: annotations:
summary: "Postmark 1d bounce rate >5%" summary: "Postmark 1d bounce rate >5%"
@ -566,7 +549,7 @@ data:
model: model:
intervalMs: 60000 intervalMs: 60000
maxDataPoints: 43200 maxDataPoints: 43200
expr: max(postmark_api_up) or on() vector(0) expr: min_over_time(max by (instance) (postmark_api_up)[5m])
legendFormat: api up legendFormat: api up
datasource: datasource:
type: prometheus type: prometheus
@ -595,7 +578,7 @@ data:
reducer: reducer:
type: last type: last
type: query type: query
noDataState: OK noDataState: NoData
execErrState: Error execErrState: Error
annotations: annotations:
summary: "Postmark exporter reports API down" summary: "Postmark exporter reports API down"

View File

@ -1134,7 +1134,7 @@ data:
{ {
"id": 17, "id": 17,
"type": "stat", "type": "stat",
"title": "Ariadne + Metis CI Coverage (%)", "title": "Ariadne CI Coverage (%)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1147,7 +1147,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_coverage_percent{repo=~\"ariadne|metis\"}", "expr": "ariadne_ci_coverage_percent{repo=\"ariadne\"}",
"refId": "A", "refId": "A",
"legendFormat": "{{branch}}", "legendFormat": "{{branch}}",
"instant": true "instant": true
@ -1197,7 +1197,7 @@ data:
{ {
"id": 18, "id": 18,
"type": "table", "type": "table",
"title": "Ariadne + Metis CI Tests (latest)", "title": "Ariadne CI Tests (latest)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1210,7 +1210,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "ariadne_ci_tests_total{repo=~\"ariadne|metis\"}", "expr": "ariadne_ci_tests_total{repo=\"ariadne\"}",
"refId": "A", "refId": "A",
"instant": true "instant": true
} }

View File

@ -1686,7 +1686,7 @@ data:
{ {
"id": 42, "id": 42,
"type": "timeseries", "type": "timeseries",
"title": "Ariadne + Metis Test Success Rate", "title": "Ariadne Test Success Rate",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1699,7 +1699,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)", "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"passed|failed|error\"}[30d])), 1)",
"refId": "A" "refId": "A"
} }
], ],
@ -1723,7 +1723,7 @@ data:
{ {
"id": 43, "id": 43,
"type": "bargauge", "type": "bargauge",
"title": "Ariadne + Metis Tests with Failures (24h)", "title": "Tests with Failures (24h)",
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
"uid": "atlas-vm" "uid": "atlas-vm"
@ -1736,7 +1736,7 @@ data:
}, },
"targets": [ "targets": [
{ {
"expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))", "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=\"ariadne\",result=~\"failed|error\"}[24h])))",
"refId": "A", "refId": "A",
"legendFormat": "{{result}}", "legendFormat": "{{result}}",
"instant": true "instant": true

View File

@ -286,7 +286,7 @@ spec:
podAnnotations: podAnnotations:
vault.hashicorp.com/agent-inject: "true" vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "monitoring" vault.hashicorp.com/role: "monitoring"
monitoring.bstein.dev/restart-rev: "6" monitoring.bstein.dev/restart-rev: "4"
vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin" vault.hashicorp.com/agent-inject-secret-grafana-env.sh: "kv/data/atlas/monitoring/grafana-admin"
vault.hashicorp.com/agent-inject-template-grafana-env.sh: | vault.hashicorp.com/agent-inject-template-grafana-env.sh: |
{{ with secret "kv/data/atlas/monitoring/grafana-admin" }} {{ with secret "kv/data/atlas/monitoring/grafana-admin" }}