titan-iac/scripts/dashboards_render_atlas.py

4107 lines
139 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
2025-11-17 16:27:38 -03:00
"""Generate Atlas Grafana dashboards and render them into ConfigMaps.
Usage:
scripts/dashboards_render_atlas.py --build # rebuild JSON + ConfigMaps
scripts/dashboards_render_atlas.py # re-render ConfigMaps from JSON
"""
2025-11-17 16:27:38 -03:00
import argparse
import json
import textwrap
import urllib.parse
from pathlib import Path
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# Paths, folders, and shared metadata
# ---------------------------------------------------------------------------
# Repository root: this script lives in <root>/scripts/.
ROOT = Path(__file__).resolve().parents[1]
DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards"
# Wrapper for the Grafana-sidecar ConfigMap. `payload` is the dashboard JSON,
# expected to arrive pre-indented by the renderer so it nests under `{key}: |`.
# NOTE(review): the YAML nesting below was reconstructed from the standard
# ConfigMap layout (two-space indents under metadata/labels/data) because the
# upstream copy lost its indentation — confirm against a rendered manifest.
CONFIG_TEMPLATE = textwrap.dedent(
    """\
    # {relative_path}
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: {name}
      labels:
        grafana_dashboard: "1"
    data:
      {key}: |
    {payload}
    """
)
# Prometheus-compatible datasource used by every panel (VictoriaMetrics uid).
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
# Grafana folder names for the public overview vs. internal dashboards.
PUBLIC_FOLDER = "overview"
PRIVATE_FOLDER = "atlas-internal"
ASTRAIOS_MOUNTPOINT = "/mnt/astraios"
# Shared threshold steps for percentage gauges/stats (green through red).
PERCENT_THRESHOLDS = {
    "mode": "absolute",
    "steps": [
        {"color": "green", "value": None},
        {"color": "yellow", "value": 50},
        {"color": "orange", "value": 75},
        {"color": "red", "value": 91.5},
    ],
}
# Rate window for the per-namespace CPU usage queries.
NAMESPACE_CPU_WINDOW = "1m"
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# Cluster metadata
# ---------------------------------------------------------------------------
# Control-plane nodes, plus the external hosts the control plane depends on.
CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
CONTROL_DEPENDENCIES = ["titan-db", "titan-jh"]
CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES
# Worker inventory; order is preserved as-is (it feeds NODE_TIEBREAKER below).
WORKER_NODES = [
    "titan-04",
    "titan-05",
    "titan-06",
    "titan-07",
    "titan-08",
    "titan-09",
    "titan-10",
    "titan-11",
    "titan-20",
    "titan-21",
    "titan-12",
    "titan-13",
    "titan-14",
    "titan-15",
    "titan-16",
    "titan-17",
    "titan-18",
    "titan-19",
    "titan-22",
    "titan-24",
]
# PromQL regex alternations for matching each node group by name.
CONTROL_REGEX = "|".join(CONTROL_PLANE_NODES)
CONTROL_ALL_REGEX = "|".join(CONTROL_ALL)
WORKER_REGEX = "|".join(WORKER_NODES)
CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
WORKER_TOTAL = len(WORKER_NODES)
# "/N" suffixes rendered next to ready-node counts in stat panels.
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
# Namespaces considered infrastructure (excluded from workload counts)
INFRA_PATTERNS = [
    "kube-.*",
    ".*-system",
    "traefik",
    "monitoring",
    "logging",
    "cert-manager",
    "maintenance",
    "postgres",
]
# Anchored alternation so each pattern must match the whole namespace name.
INFRA_REGEX = f"^({'|'.join(INFRA_PATTERNS)})$"
# Namespaces allowed on control plane without counting as workloads
CP_ALLOWED_NS = INFRA_REGEX
# Nodes that host Longhorn storage replicas.
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
# Grid column widths for a 24-wide gauge row layout.
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
# Count of non-infrastructure pods scheduled on control-plane nodes
# (should normally be zero); vector(0) keeps the stat from going blank.
CONTROL_WORKLOADS_EXPR = (
    f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
)
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# PromQL helpers
# ---------------------------------------------------------------------------
# Copies node_exporter's `nodename` label into a `node` label so node_exporter
# series (keyed by instance) can be joined with kube-state-metrics series.
NODE_INFO = 'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")'


def node_filter(regex):
    """Return a selector that evaluates to 1 for nodes matching the regex."""
    scoped_info = f'node_uname_info{{nodename=~"{regex}"}}'
    return f'label_replace({scoped_info}, "node", "$1", "nodename", "(.*)")'


def scoped_node_expr(base, scope=""):
    """Attach nodename metadata and optionally filter to a scope regex."""
    labelled = f"avg by (node) (({base}) * on(instance) group_left(node) {NODE_INFO})"
    if not scope:
        return labelled
    return f"({labelled}) * on(node) group_left() {node_filter(scope)}"


def node_cpu_expr(scope=""):
    """Per-node CPU busy percentage (100 - idle) over a 5m rate window."""
    busy_pct = '(1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100'
    return scoped_node_expr(busy_pct, scope)


def node_mem_expr(scope=""):
    """Per-node memory usage percentage, derived from MemTotal - MemAvailable."""
    used_pct = (
        "avg by (instance) ("
        "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) "
        "/ node_memory_MemTotal_bytes * 100)"
    )
    return scoped_node_expr(used_pct, scope)


def filesystem_usage_expr(mount, scope=""):
    """Per-node usage percentage for the filesystem mounted at *mount*."""
    fs_labels = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
    used_pct = (
        f"avg by (instance) ("
        f"(1 - (node_filesystem_avail_bytes{fs_labels} "
        f"/ node_filesystem_size_bytes{fs_labels})) * 100)"
    )
    return scoped_node_expr(used_pct, scope)


def root_usage_expr(scope=""):
    """Root filesystem usage percentage."""
    return filesystem_usage_expr("/", scope)


def astraios_usage_expr(scope=""):
    """Usage percentage for the shared Astraios mount."""
    return filesystem_usage_expr(ASTRAIOS_MOUNTPOINT, scope)
2025-11-17 16:27:38 -03:00
def astreae_usage_expr(mount):
    """Aggregate usage percentage across every device backing *mount*."""
    labels = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
    avail = f"sum(node_filesystem_avail_bytes{labels})"
    size = f"sum(node_filesystem_size_bytes{labels})"
    return f"100 - ({avail} / {size} * 100)"


def astreae_free_expr(mount):
    """Total free bytes across every device backing *mount*."""
    return f'sum(node_filesystem_avail_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}})'


def topk_with_node(expr):
    """Keep only the top-1 series and surface its node label as the metric name."""
    return f'label_replace(topk(1, {expr}), "__name__", "$1", "node", "(.*)")'
2025-11-17 20:19:20 -03:00
def node_net_expr(scope=""):
    """Per-node network throughput (rx + tx, loopback excluded)."""
    throughput = (
        'sum by (instance) ('
        'rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
        '+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))'
    )
    return scoped_node_expr(throughput, scope)


def node_io_expr(scope=""):
    """Per-node disk throughput (reads + writes)."""
    throughput = (
        "sum by (instance) (rate(node_disk_read_bytes_total[5m]) "
        "+ rate(node_disk_written_bytes_total[5m]))"
    )
    return scoped_node_expr(throughput, scope)


def namespace_selector(scope_var):
    """Container-level selector excluding pause containers, plus the scope filter."""
    return f'namespace!="",pod!="",container!="",container!="POD",{scope_var}'


def namespace_gpu_selector(scope_var):
    """Pod-level selector (no container label), plus the scope filter."""
    return f'namespace!="",pod!="",{scope_var}'


def namespace_cpu_raw(scope_var):
    """Per-namespace CPU usage rate over NAMESPACE_CPU_WINDOW."""
    selector = namespace_selector(scope_var)
    return (
        "sum(rate(container_cpu_usage_seconds_total"
        f"{{{selector}}}[{NAMESPACE_CPU_WINDOW}])) by (namespace)"
    )


def namespace_ram_raw(scope_var):
    """Per-namespace working-set memory bytes."""
    return f"sum(container_memory_working_set_bytes{{{namespace_selector(scope_var)}}}) by (namespace)"


def namespace_gpu_usage_instant(scope_var):
    """Per-namespace GPU usage; delegates to gpu_usage_by_namespace."""
    return gpu_usage_by_namespace(scope_var)
2026-01-26 22:26:24 -03:00
def jetson_gpu_util_by_node():
    """GPU utilisation per Jetson node from the jetson exporter."""
    return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'


def dcgm_gpu_util_by_node():
    """DCGM GPU utilisation per node, joined via kube_pod_info for node names."""
    # DCGM only exposes Hostname (the exporter pod name); rewrite it into
    # namespace/pod labels so it can be joined onto kube_pod_info.
    with_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")'
    with_ns = f'label_replace({with_pod}, "namespace", "monitoring", "", "")'
    return (
        f"avg by (node) ({with_ns} * on(namespace,pod) group_left(node) "
        'kube_pod_info{namespace="monitoring"})'
    )


def gpu_util_by_node():
    """DCGM utilisation where available, falling back to Jetson metrics."""
    return f"{dcgm_gpu_util_by_node()} or {jetson_gpu_util_by_node()}"


def gpu_util_by_hostname():
    """Same as gpu_util_by_node, re-exposed under a Hostname label."""
    return f'label_replace({gpu_util_by_node()}, "Hostname", "$1", "node", "(.*)")'


def gpu_node_labels():
    """Selector matching nodes labelled with an accelerator or the jetson flag."""
    return 'kube_node_labels{label_accelerator=~".+"} or kube_node_labels{label_jetson="true"}'
def gpu_requests_by_namespace_node(scope_var):
    """GPU resource requests summed per (namespace, node), restricted to GPU-labelled nodes."""
    joined = (
        f'kube_pod_container_resource_requests{{resource=~"nvidia.com/gpu.*",{scope_var}}} '
        "* on(namespace,pod) group_left(node) kube_pod_info "
        f"* on(node) group_left() ({gpu_node_labels()})"
    )
    return f"sum by (namespace,node) ({joined})"


def gpu_usage_by_namespace(scope_var):
    """Split each node's measured GPU utilisation across namespaces by request share."""
    per_ns_node = gpu_requests_by_namespace_node(scope_var)
    per_node = f"sum by (node) ({per_ns_node})"
    # clamp_min guards against division by zero when a node has no requests.
    return (
        f"sum by (namespace) (({per_ns_node}) / clamp_min({per_node}, 1) "
        f"* on(node) group_left() ({gpu_util_by_node()}))"
    )


def jetson_gpu_usage_by_namespace(scope_var):
    """Jetson-only variant of gpu_usage_by_namespace.

    NOTE(review): relies on jetson_gpu_requests(), which is not defined in
    this portion of the module — confirm it exists elsewhere in the file.
    """
    per_ns_node = jetson_gpu_requests(scope_var)
    per_node = f"sum by (node) ({per_ns_node})"
    return (
        f"sum by (namespace) (({per_ns_node}) / clamp_min({per_node}, 1) "
        f"* on(node) group_left() {jetson_gpu_util_by_node()})"
    )
def namespace_share_expr(resource_expr):
    """Express each namespace's value as a percentage of the overall total."""
    denominator = f"clamp_min(sum( {resource_expr} ), 1)"
    return f"100 * ( {resource_expr} ) / {denominator}"


def namespace_cpu_share_expr(scope_var):
    """Per-namespace CPU share (percent of total)."""
    return namespace_share_expr(namespace_cpu_raw(scope_var))


def namespace_ram_share_expr(scope_var):
    """Per-namespace RAM share (percent of total)."""
    return namespace_share_expr(namespace_ram_raw(scope_var))


def namespace_gpu_share_expr(scope_var):
    """Per-namespace GPU share; emits a synthetic 100% "idle" series when total usage is zero."""
    usage = namespace_gpu_usage_instant(scope_var)
    total = f"(sum({usage}) or on() vector(0))"
    share = f"100 * ({usage}) / clamp_min({total}, 1)"
    idle = f'label_replace(vector(100), "namespace", "idle", "", "") * scalar({total} == bool 0)'
    return f"({share}) or ({idle})"
2025-11-17 23:12:16 -03:00
# Pods in an unhealthy phase (anything other than Running/Succeeded).
PROBLEM_PODS_EXPR = (
    'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"})) '
    "or on() vector(0)"
)
# Containers waiting in CrashLoopBackOff or ImagePullBackOff.
CRASHLOOP_EXPR = (
    'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
    '{reason=~"CrashLoopBackOff|ImagePullBackOff"})) '
    "or on() vector(0)"
)
# Pods whose deletion timestamp is set and older than 600 s (10 minutes).
STUCK_TERMINATING_EXPR = (
    'sum(max by (namespace,pod) ('
    '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)'
    ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
    ')) '
    "or on() vector(0)"
)
# Lookback window for the availability/uptime stats.
UPTIME_WINDOW = "365d"
# Keep the subquery step coarse so we don't request an excessive number of points.
UPTIME_STEP = "1h"
# Fraction of desired Traefik replicas currently available.
TRAEFIK_READY_EXPR = (
    "("
    'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
    " / clamp_min("
    'sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)'
    ")"
)
# Fraction of control-plane nodes reporting Ready.
CONTROL_READY_FRACTION_EXPR = (
    f"(sum(kube_node_status_condition{{condition=\"Ready\",status=\"true\",node=~\"{CONTROL_REGEX}\"}})"
    f" / {CONTROL_TOTAL})"
)
# Availability is the worse of control-plane readiness and ingress readiness.
# NOTE(review): two-argument min() is MetricsQL (VictoriaMetrics) syntax, not
# vanilla PromQL — confirm the datasource accepts it.
UPTIME_AVAIL_EXPR = (
    f"min(({CONTROL_READY_FRACTION_EXPR}), ({TRAEFIK_READY_EXPR}))"
)
# Tie-breaker to deterministically pick one node per namespace when shares tie.
NODE_TIEBREAKER = " + ".join(
    f"({node_filter(node)}) * 1e-6 * {idx}"
    for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
)
UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:{UPTIME_STEP}])"
# NOTE(review): despite the name this is a 0-1 fraction, not 0-100 — the
# UPTIME_PERCENT_THRESHOLDS below are likewise fractional; confirm the panel
# unit accounts for that.
UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
# "Number of nines" of availability; clamp avoids log10(0) at 100% uptime.
UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
# Thresholds in nines (2 = 99%, 3 = 99.9%, ...).
UPTIME_THRESHOLDS = {
    "mode": "absolute",
    "steps": [
        {"color": "red", "value": None},
        {"color": "orange", "value": 2},
        {"color": "yellow", "value": 3},
        {"color": "green", "value": 3.5},
    ],
}
# Thresholds as availability fractions (0.99 = two nines, etc.).
UPTIME_PERCENT_THRESHOLDS = {
    "mode": "absolute",
    "steps": [
        {"color": "red", "value": None},
        {"color": "orange", "value": 0.99},
        {"color": "yellow", "value": 0.999},
        {"color": "green", "value": 0.9999},
        {"color": "blue", "value": 0.99999},
    ],
}
2025-11-17 16:27:38 -03:00
# Table rows: pod age in seconds, joined with its node and failing phase.
PROBLEM_TABLE_EXPR = (
    "(time() - kube_pod_created{pod!=\"\"}) "
    "* on(namespace,pod) group_left(node) kube_pod_info "
    "* on(namespace,pod) group_left(phase) "
    "max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})"
)
# Table rows: pod age joined with the per-container waiting reason.
CRASHLOOP_TABLE_EXPR = (
    "(time() - kube_pod_created{pod!=\"\"}) "
    "* on(namespace,pod) group_left(node) kube_pod_info "
    "* on(namespace,pod,container) group_left(reason) "
    "max by (namespace,pod,container,reason) "
    "(kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})"
)
# Table rows: seconds since deletion was requested, for pods still terminating.
STUCK_TABLE_EXPR = (
    "("
    "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) "
    "and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) "
    "* on(namespace,pod) group_left(node) kube_pod_info"
    ")"
)
# Scope selectors for the namespace resource panels: workloads only, everything,
# or infrastructure namespaces only.
NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"'
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
# Dashboard template-variable names carrying the chosen scope per resource.
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
def promql_task_regex(tasks):
    """Return a PromQL-safe regex alternation for the provided task names."""
    alternation = "|".join(tasks)
    return alternation
# Ariadne scheduler task inventory. ALL drives the schedule tables; FAST
# excludes two long-interval comms tasks; HEALTH is the subset whose staleness
# feeds the health stats.
ARIADNE_ALL_SCHEDULE_TASKS = [
    "schedule.mailu_sync",
    "schedule.nextcloud_sync",
    "schedule.nextcloud_cron",
    "schedule.nextcloud_maintenance",
    "schedule.vaultwarden_sync",
    "schedule.wger_user_sync",
    "schedule.wger_admin",
    "schedule.firefly_user_sync",
    "schedule.firefly_cron",
    "schedule.vault_k8s_auth",
    "schedule.vault_oidc",
    "schedule.comms_guest_name",
    "schedule.comms_pin_invite",
    "schedule.comms_reset_room",
    "schedule.comms_seed_room",
    "schedule.pod_cleaner",
    "schedule.opensearch_prune",
    "schedule.image_sweeper",
    "schedule.metis_k3s_token_sync",
    "schedule.platform_quality_suite_probe",
]
ARIADNE_FAST_SCHEDULE_TASKS = [
    task
    for task in ARIADNE_ALL_SCHEDULE_TASKS
    if task not in {"schedule.comms_pin_invite", "schedule.comms_reset_room"}
]
ARIADNE_SCHEDULE_HEALTH_TASKS = [
    "schedule.nextcloud_sync",
    "schedule.nextcloud_cron",
    "schedule.vaultwarden_sync",
    "schedule.wger_user_sync",
    "schedule.firefly_user_sync",
    "schedule.comms_guest_name",
    "schedule.comms_seed_room",
    "schedule.pod_cleaner",
    "schedule.image_sweeper",
    "schedule.metis_k3s_token_sync",
    "schedule.platform_quality_suite_probe",
]
# Anchored task-label filters for each subset.
ARIADNE_ALL_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_ALL_SCHEDULE_TASKS)})$"'
ARIADNE_FAST_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_FAST_SCHEDULE_TASKS)})$"'
ARIADNE_SCHEDULE_HEALTH_FILTER = f'task=~"^({promql_task_regex(ARIADNE_SCHEDULE_HEALTH_TASKS)})$"'
# Scheduler state metrics scoped to each subset.
ARIADNE_ALL_SCHEDULE_NEXT_RUN = f"ariadne_schedule_next_run_timestamp_seconds{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
ARIADNE_ALL_SCHEDULE_LAST_SUCCESS = (
    f"ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
)
ARIADNE_ALL_SCHEDULE_LAST_ERROR = f"ariadne_schedule_last_error_timestamp_seconds{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
ARIADNE_ALL_SCHEDULE_LAST_STATUS = f"ariadne_schedule_last_status{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
ARIADNE_FAST_SCHEDULE_LAST_SUCCESS = (
    f"ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_FAST_SCHEDULE_FILTER}}}"
)
ARIADNE_FAST_SCHEDULE_LAST_ERROR = f"ariadne_schedule_last_error_timestamp_seconds{{{ARIADNE_FAST_SCHEDULE_FILTER}}}"
ARIADNE_FAST_SCHEDULE_LAST_STATUS = f"ariadne_schedule_last_status{{{ARIADNE_FAST_SCHEDULE_FILTER}}}"
ARIADNE_HEALTH_SCHEDULE_LAST_SUCCESS = (
    f"ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_SCHEDULE_HEALTH_FILTER}}}"
)
ARIADNE_HEALTH_SCHEDULE_LAST_STATUS = f"ariadne_schedule_last_status{{{ARIADNE_SCHEDULE_HEALTH_FILTER}}}"
# Ages (seconds / hours) since last success/error per task.
ARIADNE_SCHEDULE_LAST_SUCCESS_AGE = f"(time() - {ARIADNE_HEALTH_SCHEDULE_LAST_SUCCESS})"
ARIADNE_SCHEDULE_LAST_ERROR_AGE = f"(time() - {ARIADNE_ALL_SCHEDULE_LAST_ERROR})"
ARIADNE_SCHEDULE_LAST_SUCCESS_AGE_HOURS = f"({ARIADNE_SCHEDULE_LAST_SUCCESS_AGE}) / 3600"
ARIADNE_SCHEDULE_LAST_ERROR_AGE_HOURS = f"({ARIADNE_SCHEDULE_LAST_ERROR_AGE}) / 3600"
# A health task counts as stale after 36 hours without a success.
ARIADNE_SCHEDULE_STALE_WINDOW_SEC = 36 * 3600
ARIADNE_SCHEDULE_STALE = f"(({ARIADNE_SCHEDULE_LAST_SUCCESS_AGE}) > bool {ARIADNE_SCHEDULE_STALE_WINDOW_SEC})"
# Scheduled tasks that have never succeeded (have a next-run but no success).
ARIADNE_SCHEDULE_MISSING = (
    f"({ARIADNE_ALL_SCHEDULE_NEXT_RUN} unless on(task) {ARIADNE_HEALTH_SCHEDULE_LAST_SUCCESS})"
)
ARIADNE_SCHEDULE_FAILED = f"((1 - {ARIADNE_HEALTH_SCHEDULE_LAST_STATUS}) > bool 0)"
ARIADNE_SCHEDULE_STALE_COUNT = f"sum({ARIADNE_SCHEDULE_STALE}) or on() vector(0)"
ARIADNE_SCHEDULE_MISSING_COUNT = f"count({ARIADNE_SCHEDULE_MISSING}) or on() vector(0)"
ARIADNE_SCHEDULE_FAILED_COUNT = f"sum({ARIADNE_SCHEDULE_FAILED}) or on() vector(0)"
# Task error/success volumes over various windows.
ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))'
ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))'
ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
# Time-series panels: attempts / failures / non-ok-non-error runs per interval.
ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[$__interval]))'
ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[$__interval]))'
ARIADNE_TASK_WARNINGS_SERIES = (
    'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)'
)
# Hours since last success/error over the ALL subset (instant and range forms).
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = f"(time() - {ARIADNE_ALL_SCHEDULE_LAST_SUCCESS}) / 3600"
ARIADNE_SCHEDULE_LAST_ERROR_HOURS = f"(time() - {ARIADNE_ALL_SCHEDULE_LAST_ERROR}) / 3600"
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
    f"(time() - max_over_time({ARIADNE_ALL_SCHEDULE_LAST_SUCCESS}[$__range])) / 3600"
)
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
    f"(time() - max_over_time({ARIADNE_ALL_SCHEDULE_LAST_ERROR}[$__range])) / 3600"
)
ARIADNE_FAST_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
    f"(time() - max_over_time({ARIADNE_FAST_SCHEDULE_LAST_SUCCESS}[$__range])) / 3600"
)
# Hours until the next scheduled run (negative when overdue).
ARIADNE_FAST_SCHEDULE_NEXT_RUN_HOURS = f"(({ARIADNE_ALL_SCHEDULE_NEXT_RUN} - time()) / 3600)"
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
# Canonical names of the platform quality-gate test suites.
PLATFORM_TEST_SUITE_NAMES = [
    "ariadne",
    "metis",
    "ananke",
    "atlasbot",
    "pegasus",
    "soteria",
    "titan_iac",
    "bstein_home",
    "data_prepper",
]
# Regex of status values that count as a passing run.
PLATFORM_TEST_SUCCESS_STATUS = "ok|passed|success"
PLATFORM_TEST_CI_JOB = "platform-quality-ci"
PLATFORM_TEST_EXPORT_FILTER = f'exported_job="{PLATFORM_TEST_CI_JOB}"'
# Maps each canonical suite name to the label values it may appear under
# (some suites report with hyphenated or suffixed variants).
PLATFORM_TEST_SUITE_VALUE_BY_NAME = {
    "ariadne": "ariadne",
    "metis": "metis",
    "ananke": "ananke",
    "atlasbot": "atlasbot",
    "pegasus": "pegasus|pegasus-health|pegasus_health",
    "soteria": "soteria",
    "titan_iac": "titan_iac|titan-iac",
    "bstein_home": "bstein_home|bstein-home",
    "data_prepper": "data_prepper|data-prepper",
}
# Alternation of every accepted label value across all suites.
PLATFORM_TEST_SUITE_MATCHER = "|".join(
    PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite) for suite in PLATFORM_TEST_SUITE_NAMES
)
PLATFORM_TEST_SUITE_CANONICAL_MATCHER = "|".join(PLATFORM_TEST_SUITE_NAMES)
PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER = PLATFORM_TEST_SUITE_MATCHER
# Successful/total run counts over 30d / 7d / 24h windows; vector(0) keeps
# the stat panels populated when no runs occurred.
PLATFORM_TEST_SUCCESS_EVENTS_30D = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[30d])) or on() vector(0))'
)
PLATFORM_TEST_TOTAL_EVENTS_30D = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[30d])) or on() vector(0))'
)
PLATFORM_TEST_SUCCESS_EVENTS_7D = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[7d])) or on() vector(0))'
)
PLATFORM_TEST_TOTAL_EVENTS_7D = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[7d])) or on() vector(0))'
)
PLATFORM_TEST_SUCCESS_EVENTS_24H = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h])) or on() vector(0))'
)
PLATFORM_TEST_TOTAL_EVENTS_24H = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h])) or on() vector(0))'
)
# Success percentages per window (clamp_min avoids division by zero).
TEST_SUCCESS_RATE = (
    f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_30D}) / clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_30D}), 1)"
)
TEST_SUCCESS_RATE_7D = (
    f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_7D}) / clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_7D}), 1)"
)
TEST_SUCCESS_RATE_24H = (
    f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_24H}) / clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_24H}), 1)"
)
TEST_FAILURES_24H_TOTAL = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status!~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h])) or on() vector(0))'
)
PLATFORM_TEST_FAILURES_24H_BY_SUITE = (
    f'sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status!~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h])))'
)
PLATFORM_TEST_ACTIVITY_30D = (
    f'sum by (suite, status) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[30d]))'
)
PLATFORM_TEST_RUNS_24H_TOTAL = PLATFORM_TEST_TOTAL_EVENTS_24H
# Number of suites with at least one run in the last 24h.
PLATFORM_TEST_ACTIVE_SUITES_24H = (
    f'sum((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h])) > 0)) '
    "or on() vector(0)"
)
# Window per data point for the per-suite success-rate time series.
PLATFORM_TEST_POINT_WINDOW = "1h"
# One target per suite (refIds A, B, C, ...) charting its rolling success rate.
PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS = [
    {
        "refId": chr(ord("A") + index),
        "expr": (
            f'(100 * (sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite)}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}'
            f'[{PLATFORM_TEST_POINT_WINDOW}]))) / '
            f'clamp_min((sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite)}",{PLATFORM_TEST_EXPORT_FILTER}}}[{PLATFORM_TEST_POINT_WINDOW}]))), 1))'
        ),
        "legendFormat": suite,
    }
    for index, suite in enumerate(PLATFORM_TEST_SUITE_NAMES)
]
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
    f'sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h]))) '
    f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h]))), 1))'
)
# Index of suites seen in the last 30d; used to backfill "missing" rows below.
QUALITY_GATE_SUITE_INDEX_30D = (
    f'sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[30d]))'
)
# Coverage percent per suite; falls back to the workspace line-coverage metric.
QUALITY_GATE_COVERAGE_BY_SUITE = (
    f'(max by (suite) ({{__name__=~".*_quality_gate_coverage_percent",{PLATFORM_TEST_EXPORT_FILTER}}})) '
    f'or on(suite) (max by (suite) (platform_quality_gate_workspace_line_coverage_percent{{{PLATFORM_TEST_EXPORT_FILTER}}}))'
)
# Suites with no coverage metric render as -1 via the 0*index - 1 fallback.
QUALITY_GATE_COVERAGE_BY_SUITE_WITH_MISSING = (
    f"({QUALITY_GATE_COVERAGE_BY_SUITE}) or on(suite) (0 * ({QUALITY_GATE_SUITE_INDEX_30D}) - 1)"
)
# Shortfall against a 95% coverage target, floored at zero.
QUALITY_GATE_COVERAGE_GAP_BY_SUITE = (
    f"clamp_min(95 - ({QUALITY_GATE_COVERAGE_BY_SUITE}), 0)"
)
QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE = (
    f"max by (suite) (platform_quality_gate_source_lines_over_500_total{{{PLATFORM_TEST_EXPORT_FILTER}}})"
)
QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE_WITH_MISSING = (
    f"({QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE}) or on(suite) (0 * ({QUALITY_GATE_SUITE_INDEX_30D}) - 1)"
)
# PVC backup age per claim; unhealthy backups are forced to a 999 h sentinel.
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))"
# Ananke power/UPS exporter. Two UPS units are tracked: "Pyrphoros" feeding
# titan-db and "Statera" feeding titan-24. vector(0) fallbacks keep the stat
# panels populated when the exporter is absent.
ANANKE_SELECTOR = 'job="ananke-power"'
ANANKE_UPS_DB_NAME = "Pyrphoros"
ANANKE_UPS_DB_NODE = "titan-db"
ANANKE_UPS_TETHYS_NAME = "Statera"
ANANKE_UPS_TETHYS_NODE = "titan-24"
ANANKE_UPS_DB_SELECTOR = f'{ANANKE_SELECTOR},source="{ANANKE_UPS_DB_NAME}"'
ANANKE_UPS_TETHYS_SELECTOR = f'{ANANKE_SELECTOR},source="{ANANKE_UPS_TETHYS_NAME}"'
# Fleet-wide status: any UPS on battery / low battery, worst-case runtime.
ANANKE_UPS_ON_BATTERY = f"sum(ananke_ups_on_battery{{{ANANKE_SELECTOR}}}) or on() vector(0)"
ANANKE_UPS_LOW_BATTERY = f"sum(ananke_ups_low_battery{{{ANANKE_SELECTOR}}}) or on() vector(0)"
ANANKE_UPS_RUNTIME_MIN = f"min(ananke_ups_runtime_seconds{{{ANANKE_SELECTOR}}}) or on() vector(0)"
# Remaining runtime as a percentage of the shutdown-trigger threshold.
ANANKE_UPS_RUNTIME_HEADROOM_PERCENT = (
    f"100 * min(ananke_ups_runtime_seconds{{{ANANKE_SELECTOR}}}) / "
    f"clamp_min(max(ananke_ups_threshold_seconds{{{ANANKE_SELECTOR}}}), 1)"
)
ANANKE_UPS_TRIGGER_COUNT_1D = f"increase(ananke_shutdown_triggers_total{{{ANANKE_SELECTOR}}}[1d]) or on() vector(0)"
# Per-unit instant stats (runtime, on-battery flag, charge, load).
ANANKE_UPS_RUNTIME_DB = (
    f'max(ananke_ups_runtime_seconds{{{ANANKE_UPS_DB_SELECTOR}}}) or on() vector(0)'
)
ANANKE_UPS_RUNTIME_TETHYS = (
    f'max(ananke_ups_runtime_seconds{{{ANANKE_UPS_TETHYS_SELECTOR}}}) or on() vector(0)'
)
ANANKE_UPS_ON_BATTERY_DB = (
    f'max(ananke_ups_on_battery{{{ANANKE_UPS_DB_SELECTOR}}}) or on() vector(0)'
)
ANANKE_UPS_ON_BATTERY_TETHYS = (
    f'max(ananke_ups_on_battery{{{ANANKE_UPS_TETHYS_SELECTOR}}}) or on() vector(0)'
)
ANANKE_UPS_BATTERY_CHARGE_DB = (
    f'max(ananke_ups_battery_charge_percent{{{ANANKE_UPS_DB_SELECTOR}}}) or on() vector(0)'
)
ANANKE_UPS_BATTERY_CHARGE_TETHYS = (
    f'max(ananke_ups_battery_charge_percent{{{ANANKE_UPS_TETHYS_SELECTOR}}}) or on() vector(0)'
)
ANANKE_UPS_LOAD_DB = (
    f'max(ananke_ups_load_percent{{{ANANKE_UPS_DB_SELECTOR}}}) or on() vector(0)'
)
ANANKE_UPS_LOAD_TETHYS = (
    f'max(ananke_ups_load_percent{{{ANANKE_UPS_TETHYS_SELECTOR}}}) or on() vector(0)'
)
# Estimated power draw in watts: load% x nominal wattage / 100.
ANANKE_UPS_DRAW_WATTS_DB = (
    f'max((ananke_ups_load_percent{{{ANANKE_UPS_DB_SELECTOR}}} '
    f'* ananke_ups_power_nominal_watts{{{ANANKE_UPS_DB_SELECTOR}}}) / 100) or on() vector(0)'
)
ANANKE_UPS_DRAW_WATTS_TETHYS = (
    f'max((ananke_ups_load_percent{{{ANANKE_UPS_TETHYS_SELECTOR}}} '
    f'* ananke_ups_power_nominal_watts{{{ANANKE_UPS_TETHYS_SELECTOR}}}) / 100) or on() vector(0)'
)
# Time-series variants of the draw estimates (no max()/fallback).
ANANKE_UPS_DRAW_WATTS_DB_SERIES = (
    f'((ananke_ups_load_percent{{{ANANKE_UPS_DB_SELECTOR}}} '
    f'* ananke_ups_power_nominal_watts{{{ANANKE_UPS_DB_SELECTOR}}}) / 100)'
)
ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES = (
    f'((ananke_ups_load_percent{{{ANANKE_UPS_TETHYS_SELECTOR}}} '
    f'* ananke_ups_power_nominal_watts{{{ANANKE_UPS_TETHYS_SELECTOR}}}) / 100)'
)
# Per-source series for multi-UPS panels.
ANANKE_UPS_RUNTIME_BY_SOURCE = f"ananke_ups_runtime_seconds{{{ANANKE_SELECTOR}}}"
ANANKE_UPS_LOAD_BY_SOURCE = f"ananke_ups_load_percent{{{ANANKE_SELECTOR}}}"
ANANKE_UPS_CHARGE_BY_SOURCE = f"ananke_ups_battery_charge_percent{{{ANANKE_SELECTOR}}}"
ANANKE_UPS_TRIGGER_BY_SOURCE = f"ananke_ups_trigger_active{{{ANANKE_SELECTOR}}}"
# Typhon climate sensors. vector(0) fallbacks keep stats populated when the
# exporter is down.
CLIMATE_SENSOR_COUNT = "count(typhon_temperature_celsius) or on() vector(0)"
CLIMATE_TEMP_MAX = "max(typhon_temperature_celsius) or on() vector(0)"
# NOTE: named "pressure" but reads the vapour-pressure-deficit metric (kPa).
CLIMATE_PRESSURE_CURRENT = "max(typhon_vpd_kpa) or on() vector(0)"
CLIMATE_HUMIDITY_MAX = "max(typhon_relative_humidity_percent) or on() vector(0)"
CLIMATE_TEMP_SERIES = "typhon_temperature_celsius"
CLIMATE_PRESSURE_SERIES = "typhon_vpd_kpa"
CLIMATE_HUMIDITY_SERIES = "typhon_relative_humidity_percent"
# Dew point from temperature + relative humidity via the Magnus approximation
# (constants 17.62 / 243.12 degC); humidity is clamped to >=1% to keep ln() finite.
CLIMATE_DEWPOINT_SERIES = (
    "(243.12 * (ln(clamp_min(typhon_relative_humidity_percent, 1) / 100) + "
    "(17.62 * typhon_temperature_celsius) / (243.12 + typhon_temperature_celsius))) / "
    "(17.62 - (ln(clamp_min(typhon_relative_humidity_percent, 1) / 100) + "
    "(17.62 * typhon_temperature_celsius) / (243.12 + typhon_temperature_celsius)))"
)
CLIMATE_DEWPOINT_CURRENT = f"max({CLIMATE_DEWPOINT_SERIES}) or on() vector(0)"
# Fan speed levels per fan group: instant values ...
CLIMATE_FAN_OUTLET_CURRENT = (
    'max(typhon_fan_speed_level{fan_group="outlet"}) or on() vector(0)'
)
CLIMATE_FAN_INSIDE_INLET_CURRENT = (
    'max(typhon_fan_speed_level{fan_group="inside_inlet"}) or on() vector(0)'
)
CLIMATE_FAN_OUTSIDE_INLET_CURRENT = (
    'max(typhon_fan_speed_level{fan_group="outside_inlet"}) or on() vector(0)'
)
CLIMATE_FAN_INTERIOR_CURRENT = (
    'max(typhon_fan_speed_level{fan_group="interior"}) or on() vector(0)'
)
# ... and raw series for time-series panels.
CLIMATE_FAN_OUTLET_SERIES = (
    'typhon_fan_speed_level{fan_group="outlet"}'
)
CLIMATE_FAN_INSIDE_INLET_SERIES = (
    'typhon_fan_speed_level{fan_group="inside_inlet"}'
)
CLIMATE_FAN_OUTSIDE_INLET_SERIES = (
    'typhon_fan_speed_level{fan_group="outside_inlet"}'
)
CLIMATE_FAN_INTERIOR_SERIES = (
    'typhon_fan_speed_level{fan_group="interior"}'
)
# Two-series panel: connections in use vs. the configured maximum, tagged
# with a synthetic "conn" label for the legend.
POSTGRES_CONN_USED = (
    'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
    'or label_replace(max(pg_settings_max_connections), "conn", "max", "__name__", ".*")'
)
POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))'
2026-01-21 13:37:36 -03:00
# Jobs owned by CronJobs, with the Job name surfaced as owner_name so they
# can be excluded from the "one-off" set below.
ONEOFF_JOB_OWNER = (
    'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")'
)
# Pods of Jobs that are NOT owned by a CronJob (i.e. manually created one-offs).
ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})'
# Age in hours of one-off Job pods still Running or already Succeeded.
ONEOFF_JOB_POD_AGE_HOURS = (
    '((time() - kube_pod_start_time{pod!=""}) / 3600) '
    f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} '
    '* on(namespace,pod) group_left(phase) '
    'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})'
)
2025-11-18 10:47:24 -03:00
# Nodes carrying GPUs.
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES)
# Traefik request rate per router.
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
# Bytes in/out of the Traefik pods themselves.
TRAEFIK_NET_INGRESS = (
    'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
    " or on() vector(0)"
)
TRAEFIK_NET_EGRESS = (
    'sum(rate(container_network_transmit_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
    " or on() vector(0)"
)
# Cluster-wide container network throughput (cAdvisor view).
NET_CLUSTER_RX = (
    'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
    " or on() vector(0)"
)
NET_CLUSTER_TX = (
    'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
    " or on() vector(0)"
)
# Excludes loopback and all virtual/overlay interfaces so only physical NICs count.
PHYSICAL_NET_FILTER = 'device!~"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*"'
NET_NODE_RX_PHYS = (
    f'sum(rate(node_network_receive_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)'
)
NET_NODE_TX_PHYS = (
    f'sum(rate(node_network_transmit_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)'
)
# NOTE(review): "total" is aliased to physical transmit only (same as egress)
# — confirm that is the intended headline number.
NET_TOTAL_EXPR = NET_NODE_TX_PHYS
NET_INGRESS_EXPR = NET_NODE_RX_PHYS
NET_EGRESS_EXPR = NET_NODE_TX_PHYS
# Pod-to-pod traffic outside the Traefik namespace.
NET_INTERNAL_EXPR = (
    'sum(rate(container_network_receive_bytes_total{namespace!="traefik",pod!=""}[5m]) '
    '+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))'
    ' or on() vector(0)'
)
# API server / etcd health signals.
APISERVER_5XX_RATE = 'sum(rate(apiserver_request_total{code=~"5.."}[5m]))'
APISERVER_P99_LATENCY_MS = (
    "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000"
)
ETCD_P99_LATENCY_MS = (
    "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000"
)
# Traefik entrypoint SLI over a fixed 5m window (non-5xx fraction).
TRAEFIK_TOTAL_5M = "sum(rate(traefik_entrypoint_requests_total[5m]))"
TRAEFIK_SUCCESS_5M = 'sum(rate(traefik_entrypoint_requests_total{code!~"5.."}[5m]))'
TRAEFIK_SLI_5M = f"({TRAEFIK_SUCCESS_5M}) / clamp_min({TRAEFIK_TOTAL_5M}, 1)"
TRAEFIK_P99_LATENCY_MS = (
    "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
)
TRAEFIK_P95_LATENCY_MS = (
    "histogram_quantile(0.95, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
)
# Availability SLO target for ingress (three nines).
SLO_AVAILABILITY = 0.999


def traefik_sli(window):
    """Fraction of Traefik entrypoint requests without a 5xx response over *window*."""
    ok = f'sum(rate(traefik_entrypoint_requests_total{{code!~"5.."}}[{window}]))'
    all_requests = f'sum(rate(traefik_entrypoint_requests_total[{window}]))'
    return f"({ok}) / clamp_min({all_requests}, 1)"


def traefik_burn(window):
    """Error-budget burn rate: observed error fraction over the SLO's error budget."""
    error_budget = 1 - SLO_AVAILABILITY
    return f"(1 - ({traefik_sli(window)})) / {error_budget}"
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# Panel factories
# ---------------------------------------------------------------------------
2025-11-17 16:27:38 -03:00
def stat_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    decimals=None,
    thresholds=None,
    text_mode="value",
    legend=None,
    instant=False,
    value_suffix=None,
    links=None,
    targets=None,
    field_overrides=None,
    description=None,
    orientation=None,
    wide_layout=None,
):
    """Return a Grafana stat panel definition.

    *expr* becomes the single PromQL target unless an explicit *targets* list
    is given. *legend* is applied only when the panel has exactly one target;
    *instant* marks every target as an instant query. Colouring defaults to
    grey-then-green at value 1 unless *thresholds* overrides it.
    (Stray VCS blame-timestamp lines that had been pasted into this function
    were removed; logic is otherwise unchanged.)
    """
    defaults = {
        "color": {"mode": "thresholds"},
        "mappings": [],
        "thresholds": thresholds
        or {
            "mode": "absolute",
            "steps": [
                {"color": "rgba(115, 115, 115, 1)", "value": None},
                {"color": "green", "value": 1},
            ],
        },
        "unit": unit,
        "custom": {"displayMode": "auto"},
    }
    if value_suffix:
        defaults["custom"]["valueSuffix"] = value_suffix
    if decimals is not None:
        defaults["decimals"] = decimals
    target_list = targets if targets is not None else [{"expr": expr, "refId": "A"}]
    panel = {
        "id": panel_id,
        "type": "stat",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": target_list,
        "fieldConfig": {"defaults": defaults, "overrides": field_overrides or []},
        "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "textMode": text_mode,
        },
    }
    if orientation:
        panel["options"]["orientation"] = orientation
    if wide_layout is not None:
        panel["options"]["wideLayout"] = wide_layout
    if legend and len(panel["targets"]) == 1:
        panel["targets"][0]["legendFormat"] = legend
    if instant:
        # setdefault so caller-supplied targets may opt out per target.
        for t in panel["targets"]:
            t.setdefault("instant", True)
    if links:
        panel["links"] = links
    if description:
        panel["description"] = description
    return panel
def gauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    min_value=0,
    max_value=1,
    thresholds=None,
    links=None,
):
    """Return a Grafana gauge panel bounded to [min_value, max_value]."""
    # Default colouring: red until the gauge reaches max_value, then green.
    if thresholds is None:
        thresholds = {
            "mode": "absolute",
            "steps": [
                {"color": "green", "value": None},
                {"color": "red", "value": max_value},
            ],
        }
    panel = {
        "id": panel_id,
        "type": "gauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {
            "defaults": {
                "min": min_value,
                "max": max_value,
                "thresholds": thresholds,
            },
            "overrides": [],
        },
        "options": {
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "orientation": "auto",
            "showThresholdMarkers": False,
            "showThresholdLabels": False,
        },
    }
    if links:
        panel["links"] = links
    return panel
def timeseries_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    max_value=None,
    legend=None,
    legend_display="table",
    legend_placement="bottom",
    legend_calcs=None,
    time_from=None,
    links=None,
    targets=None,
    field_overrides=None,
    description=None,
):
    """Return a Grafana time-series panel definition.

    *expr* becomes the single PromQL target unless *targets* is supplied;
    *legend* only applies when there is exactly one target. *time_from*
    overrides the dashboard time range for this panel (e.g. "30d").
    (Stray VCS blame-timestamp lines pasted into this function were removed;
    logic is otherwise unchanged.)
    """
    target_list = targets if targets is not None else [{"expr": expr, "refId": "A"}]
    panel = {
        "id": panel_id,
        "type": "timeseries",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": target_list,
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": field_overrides or []},
        "options": {
            "legend": {
                "displayMode": legend_display,
                "placement": legend_placement,
            },
            "tooltip": {"mode": "multi"},
        },
    }
    if max_value is not None:
        panel["fieldConfig"]["defaults"]["max"] = max_value
    if legend and len(panel["targets"]) == 1:
        panel["targets"][0]["legendFormat"] = legend
    if legend_calcs:
        panel["options"]["legend"]["calcs"] = legend_calcs
    if time_from:
        panel["timeFrom"] = time_from
    if links:
        panel["links"] = links
    if description:
        panel["description"] = description
    return panel
def table_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    transformations=None,
    instant=False,
    options=None,
    filterable=True,
    footer=None,
    format=None,  # shadows the builtin; kept as-is so keyword callers still work
    description=None,
):
    """Return a Grafana table panel definition.

    *options* entries are merged over the default table options; *footer* is
    added only when explicitly provided. *format* (e.g. "table") is forwarded
    to the Prometheus target. (Stray VCS blame-timestamp residue removed;
    logic otherwise unchanged.)
    """
    panel_options = {"showHeader": True, "columnFilters": False}
    if options:
        panel_options.update(options)
    if footer is not None:
        panel_options["footer"] = footer
    field_defaults = {"unit": unit, "custom": {"filterable": filterable}}
    target = {"expr": expr, "refId": "A", **({"instant": True} if instant else {})}
    if format:
        target["format"] = format
    panel = {
        "id": panel_id,
        "type": "table",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [target],
        "fieldConfig": {"defaults": field_defaults, "overrides": []},
        "options": panel_options,
    }
    if transformations:
        panel["transformations"] = transformations
    if description:
        panel["description"] = description
    return panel
def pie_panel(panel_id, title, expr, grid, *, links=None, description=None):
    """Return a pie chart panel with readable namespace labels.

    The single target's legend is forced to "{{namespace}}" so slices are
    labelled by namespace. (A stray VCS blame-timestamp line between the
    signature and docstring was removed; logic is otherwise unchanged.)
    """
    panel = {
        "id": panel_id,
        "type": "piechart",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
        "fieldConfig": {
            "defaults": {
                "unit": "percent",
                "color": {"mode": "palette-classic"},
            },
            "overrides": [],
        },
        "options": {
            "legend": {"displayMode": "list", "placement": "right"},
            "pieType": "pie",
            "displayLabels": [],
            "tooltip": {"mode": "single"},
            "colorScheme": "interpolateSpectral",
            "colorBy": "value",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
        },
    }
    if links:
        panel["links"] = links
    if description:
        panel["description"] = description
    return panel
def namespace_scope_variable(var_name, label):
    """Return a hidden custom template variable that switches namespace scope.

    The first choice (workload namespaces) is pre-selected; the variable is
    hidden (hide=2) and driven via panel links instead of the picker.
    """
    # (display text, regex value) pairs; both the option objects and the
    # "text : value" query string are derived from this single list.
    choices = [
        ("workload namespaces only", NAMESPACE_SCOPE_WORKLOAD),
        ("all namespaces", NAMESPACE_SCOPE_ALL),
        ("infrastructure namespaces only", NAMESPACE_SCOPE_INFRA),
    ]
    options = [
        {"text": text, "value": value, "selected": index == 0}
        for index, (text, value) in enumerate(choices)
    ]
    query = ",".join(f"{text} : {value}" for text, value in choices)
    first = options[0]
    return {
        "name": var_name,
        "label": label,
        "type": "custom",
        "query": query,
        "current": {"text": first["text"], "value": first["value"], "selected": True},
        "options": options,
        "hide": 2,
        "multi": False,
        "includeAll": False,
        "refresh": 1,
        "sort": 0,
        "skipUrlSync": False,
    }
def namespace_scope_links(var_name):
    """Return panel links that set *var_name* to each namespace scope.

    The targeted variable gets a URL-encoded literal value; every other scope
    variable is carried through via its ${var} placeholder so switching one
    pie's scope does not reset the others.
    """

    def scope_url(scope_value):
        encoded = urllib.parse.quote(scope_value, safe="")
        query = "&".join(
            f"var-{name}={encoded}" if name == var_name else f"var-{name}=${{{name}}}"
            for name in NAMESPACE_SCOPE_VARS
        )
        return f"?{query}"

    return [
        {"title": "Workload namespaces only", "url": scope_url(NAMESPACE_SCOPE_WORKLOAD), "targetBlank": False},
        {"title": "All namespaces", "url": scope_url(NAMESPACE_SCOPE_ALL), "targetBlank": False},
        {
            "title": "Infrastructure namespaces only",
            "url": scope_url(NAMESPACE_SCOPE_INFRA),
            "targetBlank": False,
        },
    ]
def testing_suite_variable():
    """Return the "Suite" template variable for the testing dashboard."""
    # Resolve each suite name to its metric value once, then derive both the
    # option objects and the "name : value" query string from that list.
    resolved = [
        (name, PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(name, name))
        for name in PLATFORM_TEST_SUITE_NAMES
    ]
    options = [
        {"text": name, "value": value, "selected": False} for name, value in resolved
    ]
    query = ",".join(f"{name} : {value}" for name, value in resolved)
    return {
        "name": "suite",
        "label": "Suite",
        "type": "custom",
        "query": query,
        "current": {"text": "All", "value": "$__all", "selected": True},
        "options": options,
        "hide": 0,
        "multi": False,
        "includeAll": True,
        "allValue": PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER,
        "refresh": 1,
        "sort": 1,
        "skipUrlSync": False,
    }
def bargauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    legend=None,
    links=None,
    limit=None,
    sort_order="desc",
    thresholds=None,
    decimals=None,
    instant=False,
    overrides=None,
):
    """Return a bar gauge panel with label-aware reduction.

    *expr* is wrapped in PromQL sort()/sort_desc() per *sort_order* unless it
    is already sorted, and a client-side sortBy transformation keeps the bars
    ordered as well; *limit* truncates the list after sorting. Percent panels
    get a fixed 0-100 range. (Stray VCS blame-timestamp residue removed;
    logic is otherwise unchanged.)
    """
    cleaned_expr = expr.strip()
    if not cleaned_expr.startswith(("sort(", "sort_desc(")):
        if sort_order == "desc":
            expr = f"sort_desc({expr})"
        elif sort_order == "asc":
            expr = f"sort({expr})"
    panel = {
        "id": panel_id,
        "type": "bargauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [
            {
                "expr": expr,
                "refId": "A",
                "legendFormat": legend or "{{node}}",
                **({"instant": True} if instant else {}),
            }
        ],
        "fieldConfig": {
            "defaults": {
                "unit": unit,
                "min": 0,
                "max": 100 if unit == "percent" else None,
                "thresholds": thresholds
                or {
                    "mode": "absolute",
                    "steps": [
                        {"color": "green", "value": None},
                        {"color": "yellow", "value": 50},
                        {"color": "orange", "value": 70},
                        {"color": "red", "value": 85},
                    ],
                },
            },
            "overrides": [],
        },
        "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": {
                "calcs": ["lastNotNull"],
                "fields": "",
                "values": False,
            },
        },
    }
    if overrides:
        panel["fieldConfig"]["overrides"].extend(overrides)
    if decimals is not None:
        panel["fieldConfig"]["defaults"]["decimals"] = decimals
    if links:
        panel["links"] = links
    # Keep bars ordered by value for readability.
    panel["transformations"] = [
        {
            "id": "sortBy",
            "options": {"fields": ["Value"], "order": sort_order},
        }
    ]
    if limit:
        panel["transformations"].append({"id": "limit", "options": {"limit": limit}})
    return panel
def text_panel(panel_id, title, content, grid):
    """Return a markdown text panel (no datasource attached)."""
    panel = {
        "id": panel_id,
        "type": "text",
        "title": title,
        "gridPos": grid,
        "datasource": None,
    }
    panel["options"] = {"mode": "markdown", "content": content}
    return panel
# Human-readable titles for cross-dashboard links, keyed by dashboard UID.
# link_to() falls back to "Open <uid> dashboard" for UIDs not listed here.
DASHBOARD_LINK_TITLES = {
    "atlas-overview": "Open Atlas Overview",
    "atlas-pods": "Open Atlas Pods",
    "atlas-nodes": "Open Atlas Nodes",
    "atlas-storage": "Open Atlas Storage",
    "atlas-network": "Open Atlas Network",
    "atlas-mail": "Open Atlas Mail",
    "atlas-jobs": "Open Atlas Testing",
    "atlas-power": "Open Atlas Power",
    "atlas-gpu": "Open Atlas GPU",
}
def link_to(uid):
    """Return a one-element panel-links list opening dashboard *uid* in a new tab."""
    title = DASHBOARD_LINK_TITLES.get(uid, f"Open {uid} dashboard")
    return [{"title": title, "url": f"/d/{uid}", "targetBlank": True}]
# ---------------------------------------------------------------------------
# Dashboard builders
# ---------------------------------------------------------------------------
def build_overview():
    """Assemble the public "Atlas Overview" dashboard model.

    Layout (top to bottom): row-1 health stats/gauges, hottest-node and
    storage snapshots, job/test health, UPS + climate panels, mail and
    Postgres stats, namespace share pies, network throughput, per-node
    CPU/RAM series, pod distribution, and long-window filesystem usage.

    Changes versus the previous revision: stray VCS blame-timestamp lines
    pasted into the body were removed, and the previously-unused
    ``count_thresholds`` local now replaces the four identical inline
    copies of the same threshold dict (values unchanged).
    """
    panels = []
    # Shared "0 is good, anything counted is increasingly bad" colouring.
    count_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 1},
            {"color": "orange", "value": 2},
            {"color": "red", "value": 3},
        ],
    }
    age_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 6},
            {"color": "orange", "value": 24},
            {"color": "red", "value": 48},
        ],
    }
    row1_stats = [
        {
            "id": 2,
            "title": "Control Plane Ready",
            "expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
            "kind": "gauge",
            "max_value": CONTROL_TOTAL,
            "thresholds": {
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "green", "value": CONTROL_TOTAL},
                ],
            },
        },
        {
            "id": 3,
            "title": "Control Plane Workloads",
            "expr": CONTROL_WORKLOADS_EXPR,
            "kind": "stat",
            "thresholds": count_thresholds,
            "links": link_to("atlas-pods"),
        },
        {
            "id": 5,
            "title": "Stuck Terminating",
            "expr": STUCK_TERMINATING_EXPR,
            "kind": "stat",
            "thresholds": count_thresholds,
            "links": link_to("atlas-pods"),
        },
        {
            "id": 27,
            "title": "Atlas Availability",
            "expr": UPTIME_PERCENT_EXPR,
            "kind": "stat",
            "thresholds": UPTIME_PERCENT_THRESHOLDS,
            "unit": "percentunit",
            "decimals": 4,
            "text_mode": "value",
        },
        {
            "id": 4,
            "title": "Problem Pods",
            "expr": PROBLEM_PODS_EXPR,
            "kind": "stat",
            "thresholds": count_thresholds,
            "links": link_to("atlas-pods"),
        },
        {
            "id": 6,
            "title": "CrashLoop / ImagePull",
            "expr": CRASHLOOP_EXPR,
            "kind": "stat",
            "thresholds": count_thresholds,
            "links": link_to("atlas-pods"),
        },
        {
            "id": 1,
            "title": "Workers Ready",
            "expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
            "kind": "gauge",
            "max_value": WORKER_TOTAL,
            "thresholds": {
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "orange", "value": WORKER_TOTAL - 2},
                    {"color": "yellow", "value": WORKER_TOTAL - 1},
                    {"color": "green", "value": WORKER_TOTAL},
                ],
            },
        },
    ]

    def gauge_grid(idx):
        # Row-1 widths come from GAUGE_WIDTHS; panels pack left-to-right.
        width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
        x = sum(GAUGE_WIDTHS[:idx])
        return width, x

    for idx, item in enumerate(row1_stats):
        panel_id = item["id"]
        width, x = gauge_grid(idx)
        grid = {"h": 5, "w": width, "x": x, "y": 0}
        kind = item.get("kind", "gauge")
        if kind == "stat":
            panels.append(
                stat_panel(
                    panel_id,
                    item["title"],
                    item["expr"],
                    grid,
                    thresholds=item.get("thresholds"),
                    legend=None,
                    links=item.get("links"),
                    text_mode=item.get("text_mode", "value"),
                    value_suffix=item.get("value_suffix"),
                    unit=item.get("unit", "none"),
                    decimals=item.get("decimals"),
                )
            )
        else:
            panels.append(
                gauge_panel(
                    panel_id,
                    item["title"],
                    item["expr"],
                    grid,
                    min_value=0,
                    max_value=item.get("max_value", 5),
                    thresholds=item.get("thresholds"),
                    links=item.get("links"),
                )
            )
    # Row 2: hottest-node snapshots and shared-storage capacity stats.
    top_health_panels = [
        (7, "Hottest node: CPU", topk_with_node(node_cpu_expr()), "percent"),
        (8, "Hottest node: RAM", topk_with_node(node_mem_expr()), "percent"),
        (9, "Hottest node: NET (rx+tx)", topk_with_node(node_net_expr()), "Bps"),
        (10, "Hottest node: I/O (r+w)", topk_with_node(node_io_expr()), "Bps"),
        (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
        (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"),
        (25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"),
        (26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"),
    ]
    for idx, (panel_id, title, expr, unit) in enumerate(top_health_panels):
        is_hottest_panel = panel_id in {7, 8, 9, 10}
        panels.append(
            stat_panel(
                panel_id,
                title,
                f"{expr}",
                {"h": 2, "w": 3, "x": 3 * idx, "y": 5},
                unit=unit,
                thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                text_mode="name_and_value" if is_hottest_panel else "value",
                legend="{{node}}" if is_hottest_panel else None,
                instant=is_hottest_panel,
                links=link_to("atlas-storage" if panel_id in {23, 24, 25, 26} else "atlas-nodes"),
            )
        )
    mail_bounce_rate_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 5},
            {"color": "orange", "value": 8},
            {"color": "red", "value": 10},
        ],
    }
    mail_limit_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 70},
            {"color": "orange", "value": 85},
            {"color": "red", "value": 95},
        ],
    }
    mail_success_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "red", "value": None},
            {"color": "orange", "value": 90},
            {"color": "yellow", "value": 95},
            {"color": "green", "value": 98},
        ],
    }
    # UPS on-battery flag (0/1) rendered as text.
    status_mapping = [
        {
            "type": "value",
            "options": {
                "0": {"text": "⚡ Charging"},
                "1": {"text": "🔋 Discharging"},
            },
        }
    ]
    panels.append(
        stat_panel(
            40,
            "UPS Current Load",
            None,
            {"h": 6, "w": 4, "x": 0, "y": 12},
            unit="none",
            decimals=1,
            text_mode="name_and_value",
            targets=[
                {"refId": "A", "expr": ANANKE_UPS_DRAW_WATTS_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Draw (W)", "instant": True},
                {"refId": "B", "expr": ANANKE_UPS_RUNTIME_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Discharge", "instant": True},
                {"refId": "C", "expr": ANANKE_UPS_ON_BATTERY_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Status", "instant": True},
                {"refId": "D", "expr": ANANKE_UPS_DRAW_WATTS_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Draw (W)", "instant": True},
                {"refId": "E", "expr": ANANKE_UPS_RUNTIME_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Discharge", "instant": True},
                {"refId": "F", "expr": ANANKE_UPS_ON_BATTERY_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Status", "instant": True},
            ],
            field_overrides=[
                {
                    "matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Draw (W)"},
                    "properties": [{"id": "unit", "value": "watt"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_DB_NODE}"}],
                },
                {
                    "matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Draw (W)"},
                    "properties": [{"id": "unit", "value": "watt"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}],
                },
                {
                    "matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Discharge"},
                    "properties": [{"id": "unit", "value": "s"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_DB_NODE}"}],
                },
                {
                    "matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Discharge"},
                    "properties": [{"id": "unit", "value": "s"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}],
                },
                {
                    "matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Status"},
                    "properties": [{"id": "mappings", "value": status_mapping}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_DB_NODE}"}],
                },
                {
                    "matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Status"},
                    "properties": [{"id": "mappings", "value": status_mapping}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}],
                },
            ],
            orientation="horizontal",
            wide_layout=True,
            links=link_to("atlas-power"),
            description="Per-UPS live snapshot: current draw, discharge, and charging/discharging status.",
        )
    )
    panels.append(
        timeseries_panel(
            41,
            "UPS History (Power Draw)",
            None,
            {"h": 6, "w": 4, "x": 4, "y": 12},
            unit="watt",
            targets=[
                {"refId": "A", "expr": ANANKE_UPS_DRAW_WATTS_DB_SERIES, "legendFormat": ANANKE_UPS_DB_NAME},
                {"refId": "B", "expr": ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES, "legendFormat": ANANKE_UPS_TETHYS_NAME},
            ],
            legend_display="list",
            legend_placement="bottom",
            links=link_to("atlas-power"),
        )
    )
    panels.append(
        stat_panel(
            42,
            "Current Climate",
            None,
            {"h": 6, "w": 4, "x": 8, "y": 12},
            unit="none",
            decimals=2,
            text_mode="value",
            targets=[
                {"refId": "A", "expr": CLIMATE_TEMP_MAX, "legendFormat": "Tent Temp (°C)", "instant": True},
                {"refId": "B", "expr": CLIMATE_PRESSURE_CURRENT, "legendFormat": "Tent VPD (kPa)", "instant": True},
                {"refId": "C", "expr": CLIMATE_HUMIDITY_MAX, "legendFormat": "Tent RH (%)", "instant": True},
                {"refId": "D", "expr": CLIMATE_DEWPOINT_CURRENT, "legendFormat": "Dew Point (°C)", "instant": True},
            ],
            field_overrides=[
                {"matcher": {"id": "byName", "options": "Tent Temp (°C)"}, "properties": [{"id": "unit", "value": "celsius"}]},
                {"matcher": {"id": "byName", "options": "Tent VPD (kPa)"}, "properties": [{"id": "unit", "value": "suffix:kPa"}]},
                {"matcher": {"id": "byName", "options": "Tent RH (%)"}, "properties": [{"id": "unit", "value": "percent"}]},
                {"matcher": {"id": "byName", "options": "Dew Point (°C)"}, "properties": [{"id": "unit", "value": "celsius"}]},
            ],
            links=link_to("atlas-power"),
            description="Current tent temperature, humidity, VPD, and dew point.",
            orientation="horizontal",
            wide_layout=True,
        )
    )
    panels.append(
        timeseries_panel(
            43,
            "Climate History",
            None,
            {"h": 6, "w": 4, "x": 12, "y": 12},
            unit="celsius",
            targets=[
                {"refId": "A", "expr": CLIMATE_TEMP_SERIES, "legendFormat": "Temperature (°C)"},
                {"refId": "B", "expr": CLIMATE_HUMIDITY_SERIES, "legendFormat": "Humidity (%)"},
                {"refId": "C", "expr": CLIMATE_PRESSURE_SERIES, "legendFormat": "VPD (kPa)"},
                {"refId": "D", "expr": CLIMATE_DEWPOINT_SERIES, "legendFormat": "Dew Point (°C)"},
            ],
            field_overrides=[
                {
                    "matcher": {"id": "byName", "options": "Humidity (%)"},
                    "properties": [
                        {"id": "unit", "value": "percent"},
                    ],
                },
                {
                    "matcher": {"id": "byName", "options": "VPD (kPa)"},
                    "properties": [
                        {"id": "unit", "value": "none"},
                        {"id": "custom.axisPlacement", "value": "right"},
                        {"id": "custom.axisLabel", "value": "kPa"},
                        {"id": "decimals", "value": 2},
                    ],
                }
            ],
            legend_display="list",
            legend_placement="bottom",
            links=link_to("atlas-power"),
            description="Two-axis chart: tent temperature/humidity/dew point (left axis) and VPD in kPa (right axis).",
        )
    )
    panels.append(
        stat_panel(
            140,
            "Fan Activity",
            None,
            {"h": 6, "w": 4, "x": 16, "y": 12},
            unit="none",
            decimals=0,
            text_mode="name_and_value",
            targets=[
                {"refId": "A", "expr": f"round({CLIMATE_FAN_OUTLET_CURRENT})", "legendFormat": "Inside Outlet", "instant": True},
                {"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "Inside Inlet", "instant": True},
                {"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Outside Inlet", "instant": True},
                {"refId": "D", "expr": f"round({CLIMATE_FAN_INTERIOR_CURRENT})", "legendFormat": "Interior Fans", "instant": True},
            ],
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 7},
                    {"color": "red", "value": 9},
                ],
            },
            orientation="horizontal",
            wide_layout=True,
            links=link_to("atlas-power"),
        )
    )
    panels.append(
        timeseries_panel(
            141,
            "Fan History (0-10)",
            None,
            {"h": 6, "w": 4, "x": 20, "y": 12},
            unit="none",
            max_value=10,
            targets=[
                {"refId": "A", "expr": CLIMATE_FAN_OUTLET_SERIES, "legendFormat": "Inside Outlet"},
                {"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "Inside Inlet"},
                {"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Outside Inlet"},
                {"refId": "D", "expr": CLIMATE_FAN_INTERIOR_SERIES, "legendFormat": "Interior Fans"},
            ],
            legend_display="list",
            legend_placement="bottom",
            links=link_to("atlas-power"),
        )
    )
    panels.append(
        bargauge_panel(
            44,
            "One-off Job Pods (age hours)",
            ONEOFF_JOB_POD_AGE_HOURS,
            {"h": 5, "w": 6, "x": 0, "y": 7},
            unit="h",
            instant=True,
            legend="{{namespace}}/{{pod}}",
            thresholds=age_thresholds,
            limit=12,
            decimals=2,
            links=link_to("atlas-jobs"),
        )
    )
    panels.append(
        {
            "id": 45,
            "type": "timeseries",
            "title": "Ariadne Attempts / Failures",
            "datasource": PROM_DS,
            "gridPos": {"h": 5, "w": 6, "x": 6, "y": 7},
            "targets": [
                {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
                {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
            ],
            "fieldConfig": {
                "defaults": {"unit": "none"},
                "overrides": [
                    {
                        "matcher": {"id": "byName", "options": "Attempts"},
                        "properties": [
                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
                        ],
                    },
                    {
                        "matcher": {"id": "byName", "options": "Failures"},
                        "properties": [
                            {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
                        ],
                    },
                ],
            },
            "options": {
                "legend": {"displayMode": "table", "placement": "right"},
                "tooltip": {"mode": "multi"},
            },
            "links": link_to("atlas-jobs"),
        }
    )
    test_success = timeseries_panel(
        46,
        "Platform Test Success Rate",
        None,
        {"h": 5, "w": 6, "x": 12, "y": 7},
        unit="percent",
        targets=PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS,
        legend_display="table",
        legend_placement="right",
        legend_calcs=["lastNotNull"],
        links=link_to("atlas-jobs"),
    )
    test_success["fieldConfig"]["defaults"]["min"] = 0
    test_success["fieldConfig"]["defaults"]["max"] = 100
    test_success["fieldConfig"]["defaults"]["custom"] = {
        "drawStyle": "line",
        "lineInterpolation": "linear",
        "lineWidth": 2,
        "fillOpacity": 10,
        "showPoints": "always",
        "pointSize": 4,
        "spanNulls": True,
    }
    test_success["timeFrom"] = "7d"
    test_success["description"] = (
        "Per-run interval pass points (0-100) for each software suite over the last 7 days. Points are connected to show trend; missing-run intervals are ignored."
    )
    panels.append(test_success)
    panels.append(
        bargauge_panel(
            47,
            "PVC Backup Health / Age",
            PVC_BACKUP_AGE_HOURS_BY_PVC,
            {"h": 5, "w": 6, "x": 18, "y": 7},
            unit="h",
            instant=True,
            legend="{{namespace}}/{{pvc}}",
            sort_order="desc",
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 20},
                    {"color": "orange", "value": 40},
                    {"color": "red", "value": 50},
                ],
            },
        )
    )
    panels[-1]["links"] = link_to("atlas-storage")
    panels[-1]["description"] = (
        "Oldest successful backup age in hours by PVC with nightly cadence thresholds (green <=20h, yellow <40h, orange <50h, red >=50h). PVCs with missing or unhealthy backup state are forced to 999h so critical bars stay visible."
    )
    panels.append(
        stat_panel(
            30,
            "Mail Sent (1d)",
            'max(postmark_outbound_sent{window="1d"})',
            {"h": 2, "w": 4, "x": 0, "y": 18},
            unit="none",
            links=link_to("atlas-mail"),
        )
    )
    panels.append(
        {
            "id": 31,
            "type": "stat",
            "title": "Mail Bounces (1d)",
            "datasource": PROM_DS,
            "gridPos": {"h": 2, "w": 4, "x": 8, "y": 18},
            "targets": [
                {
                    "expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
                    "refId": "A",
                    "legendFormat": "Rate",
                },
                {
                    "expr": 'max(postmark_outbound_bounced{window="1d"})',
                    "refId": "B",
                    "legendFormat": "Count",
                },
            ],
            "fieldConfig": {
                "defaults": {
                    "color": {"mode": "thresholds"},
                    "custom": {"displayMode": "auto"},
                    "thresholds": mail_bounce_rate_thresholds,
                    "unit": "none",
                },
                "overrides": [
                    {
                        "matcher": {"id": "byName", "options": "Rate"},
                        "properties": [{"id": "unit", "value": "percent"}],
                    },
                    {
                        "matcher": {"id": "byName", "options": "Count"},
                        "properties": [{"id": "unit", "value": "none"}],
                    },
                ],
            },
            "options": {
                "colorMode": "value",
                "graphMode": "area",
                "justifyMode": "center",
                "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
                "textMode": "name_and_value",
            },
            "links": link_to("atlas-mail"),
        }
    )
    panels.append(
        stat_panel(
            32,
            "Mail Success Rate (1d)",
            'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
            {"h": 2, "w": 4, "x": 4, "y": 18},
            unit="percent",
            thresholds=mail_success_thresholds,
            decimals=1,
            links=link_to("atlas-mail"),
        )
    )
    panels.append(
        stat_panel(
            33,
            "Mail Limit Used (30d)",
            "max(postmark_sending_limit_used_percent)",
            {"h": 2, "w": 4, "x": 12, "y": 18},
            unit="percent",
            thresholds=mail_limit_thresholds,
            decimals=1,
            links=link_to("atlas-mail"),
        )
    )
    panels.append(
        stat_panel(
            34,
            "Postgres Connections Used",
            POSTGRES_CONN_USED,
            {"h": 2, "w": 4, "x": 16, "y": 18},
            decimals=0,
            text_mode="name_and_value",
            legend="{{conn}}",
            instant=True,
        )
    )
    panels.append(
        stat_panel(
            35,
            "Postgres Hottest Connections",
            POSTGRES_CONN_HOTTEST,
            {"h": 2, "w": 4, "x": 20, "y": 18},
            unit="none",
            decimals=0,
            text_mode="name_and_value",
            legend="{{datname}}",
            instant=True,
        )
    )
    # Namespace share pies, each driven by its own hidden scope variable.
    cpu_scope = "$namespace_scope_cpu"
    gpu_scope = "$namespace_scope_gpu"
    ram_scope = "$namespace_scope_ram"
    panels.append(
        pie_panel(
            11,
            "Namespace CPU Share",
            namespace_cpu_share_expr(cpu_scope),
            {"h": 9, "w": 8, "x": 0, "y": 23},
            links=namespace_scope_links("namespace_scope_cpu"),
            description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
        )
    )
    panels.append(
        pie_panel(
            12,
            "Namespace GPU Share",
            namespace_gpu_share_expr(gpu_scope),
            {"h": 9, "w": 8, "x": 8, "y": 23},
            links=namespace_scope_links("namespace_scope_gpu"),
            description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
        )
    )
    panels.append(
        pie_panel(
            13,
            "Namespace RAM Share",
            namespace_ram_share_expr(ram_scope),
            {"h": 9, "w": 8, "x": 16, "y": 23},
            links=namespace_scope_links("namespace_scope_ram"),
            description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
        )
    )
    worker_filter = f"{WORKER_REGEX}"
    panels.append(
        timeseries_panel(
            14,
            "Worker Node CPU",
            node_cpu_expr(worker_filter),
            {"h": 12, "w": 12, "x": 0, "y": 39},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            links=link_to("atlas-nodes"),
        )
    )
    panels.append(
        timeseries_panel(
            15,
            "Worker Node RAM",
            node_mem_expr(worker_filter),
            {"h": 12, "w": 12, "x": 12, "y": 39},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            links=link_to("atlas-nodes"),
        )
    )
    panels.append(
        timeseries_panel(
            16,
            "Control plane CPU",
            node_cpu_expr(CONTROL_ALL_REGEX),
            {"h": 10, "w": 12, "x": 0, "y": 51},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            17,
            "Control plane RAM",
            node_mem_expr(CONTROL_ALL_REGEX),
            {"h": 10, "w": 12, "x": 12, "y": 51},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        pie_panel(
            28,
            "Node Pod Share",
            '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
            {"h": 10, "w": 12, "x": 0, "y": 61},
        )
    )
    panels.append(
        bargauge_panel(
            29,
            "Top Nodes by Pod Count",
            'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
            {"h": 10, "w": 12, "x": 12, "y": 61},
            unit="none",
            limit=12,
            decimals=0,
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 50},
                    {"color": "orange", "value": 75},
                    {"color": "red", "value": 100},
                ],
            },
            instant=True,
        )
    )
    panels.append(
        timeseries_panel(
            18,
            "Cluster Ingress Throughput",
            NET_INGRESS_EXPR,
            {"h": 7, "w": 8, "x": 0, "y": 32},
            unit="Bps",
            legend="Ingress (Traefik)",
            legend_display="list",
            legend_placement="bottom",
            links=link_to("atlas-network"),
        )
    )
    panels.append(
        timeseries_panel(
            19,
            "Cluster Egress Throughput",
            NET_EGRESS_EXPR,
            {"h": 7, "w": 8, "x": 8, "y": 32},
            unit="Bps",
            legend="Egress (Traefik)",
            legend_display="list",
            legend_placement="bottom",
            links=link_to("atlas-network"),
        )
    )
    panels.append(
        timeseries_panel(
            20,
            "Intra-Cluster Throughput",
            NET_INTERNAL_EXPR,
            {"h": 7, "w": 8, "x": 16, "y": 32},
            unit="Bps",
            legend="Internal traffic",
            legend_display="list",
            legend_placement="bottom",
            links=link_to("atlas-network"),
        )
    )
    panels.append(
        timeseries_panel(
            21,
            "Root Filesystem Usage",
            root_usage_expr(),
            {"h": 16, "w": 12, "x": 0, "y": 71},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            time_from="30d",
            links=link_to("atlas-storage"),
        )
    )
    panels.append(
        timeseries_panel(
            22,
            "Nodes Closest to Full Astraios Disks",
            astraios_usage_expr(),
            {"h": 16, "w": 12, "x": 12, "y": 71},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            time_from="1w",
            links=link_to("atlas-storage"),
        )
    )
    return {
        "uid": "atlas-overview",
        "title": "Atlas Overview",
        "folderUid": PUBLIC_FOLDER,
        "editable": False,
        "annotations": {"list": []},
        "panels": panels,
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "overview"],
        "templating": {
            "list": [
                namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"),
                namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"),
                namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"),
            ]
        },
        "time": {"from": "now-1h", "to": "now"},
        "refresh": "1m",
        "links": [
            {
                "title": "Atlas Testing (Internal)",
                "url": "/d/atlas-jobs",
                "targetBlank": False,
            }
        ],
    }
def build_pods_dashboard():
    """Build the internal "Atlas Pods" Grafana dashboard.

    Top row: stat panels for pod-health anomalies (pods not Running,
    CrashLoop/ImagePull failures, pods stuck terminating, and workloads
    scheduled onto control-plane nodes outside the allowed namespaces).
    Below: matching drill-down tables, then pod-placement breakdowns per
    node and per namespace.

    Returns:
        dict: Grafana dashboard model (uid ``atlas-pods``).
    """
    panels = []
    panels.append(
        stat_panel(
            1,
            "Problem Pods",
            PROBLEM_PODS_EXPR,
            {"h": 4, "w": 6, "x": 0, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            2,
            "CrashLoop / ImagePull",
            CRASHLOOP_EXPR,
            {"h": 4, "w": 6, "x": 6, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            3,
            "Stuck Terminating (>10m)",
            STUCK_TERMINATING_EXPR,
            {"h": 4, "w": 6, "x": 12, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            4,
            "Control Plane Workloads",
            # Any pod on a control-plane node outside the allow-listed
            # namespaces counts as a violation.
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
            {"h": 4, "w": 6, "x": 18, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )
    panels.append(
        table_panel(
            5,
            "Pods Not Running",
            PROBLEM_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 4},
            unit="s",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            6,
            "CrashLoop / ImagePull",
            CRASHLOOP_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 14},
            unit="s",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            7,
            "Terminating >10m",
            STUCK_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 24},
            unit="s",
            transformations=[
                {"id": "labelsToFields", "options": {}},
                # Only keep rows terminating for more than 600 s (10 min).
                {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
            ],
        )
    )
    panels.append(
        pie_panel(
            8,
            "Node Pod Share",
            '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
            {"h": 8, "w": 12, "x": 12, "y": 34},
        )
    )
    panels.append(
        bargauge_panel(
            9,
            "Top Nodes by Pod Count",
            'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
            {"h": 8, "w": 12, "x": 0, "y": 34},
            unit="none",
            limit=12,
            decimals=0,
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 50},
                    {"color": "orange", "value": 75},
                    {"color": "red", "value": 100},
                ],
            },
            instant=True,
        )
    )
    # Share of each namespace's pods hosted on each node, in percent.
    share_expr = (
        '(sum by (namespace,node) (kube_pod_info{pod!="" , node!=""}) '
        '/ on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=""}), 1) * 100)'
    )
    # Tiny deterministic per-node offsets (1e-3, 2e-3, ...) act as a
    # tie-breaker so exactly one node "wins" when shares are equal.
    rank_terms = [
        f"(sum by (node) (kube_node_info{{node=\"{node}\"}}) * 0 + {idx * 1e-3})"
        for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
    ]
    rank_expr = " or ".join(rank_terms)
    score_expr = f"{share_expr} + on(node) group_left() ({rank_expr})"
    # Boolean mask: 1 for the winning node per namespace, 0 otherwise.
    mask_expr = (
        f"{score_expr} == bool on(namespace) group_left() "
        f"(max by (namespace) ({score_expr}))"
    )
    panels.append(
        table_panel(
            10,
            "Namespace Plurality by Node v27",
            (
                f"{share_expr} * on(namespace,node) group_left() "
                f"({mask_expr})"
            ),
            {"h": 8, "w": 24, "x": 0, "y": 42},
            unit="percent",
            transformations=[
                {"id": "labelsToFields", "options": {}},
                {"id": "organize", "options": {"excludeByName": {"Time": True}}},
                {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 0}},
                {
                    "id": "sortBy",
                    "options": {"fields": ["Value"], "order": "desc"},
                },
                {
                    "id": "groupBy",
                    "options": {
                        "fields": {
                            "namespace": {
                                "aggregations": [
                                    {"field": "Value", "operation": "max"},
                                    {"field": "node", "operation": "first"},
                                ]
                            }
                        },
                        "rowBy": ["namespace"],
                    },
                },
            ],
            instant=True,
            options={"showColumnFilters": False},
            filterable=False,
            footer={"show": False, "fields": "", "calcs": []},
            format="table",
        )
    )
    return {
        "uid": "atlas-pods",
        "title": "Atlas Pods",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "pods"],
    }
def build_nodes_dashboard():
    """Build the internal "Atlas Nodes" Grafana dashboard.

    Covers worker/control-plane readiness counts, API-server and etcd
    latency/error stats, per-node CPU and RAM, and long-horizon root and
    Astraios filesystem usage.

    Returns:
        dict: Grafana dashboard model (uid ``atlas-nodes``).
    """
    panels = []
    panels.append(
        stat_panel(
            1,
            "Worker Nodes Ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
            {"h": 4, "w": 8, "x": 0, "y": 0},
            value_suffix=WORKER_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            2,
            "Control Plane Ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
            {"h": 4, "w": 8, "x": 8, "y": 0},
            value_suffix=CONTROL_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            3,
            "Control Plane Workloads",
            # Pods on control-plane nodes outside the allow-listed namespaces.
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
            {"h": 4, "w": 8, "x": 16, "y": 0},
        )
    )
    panels.append(
        stat_panel(
            9,
            "API Server 5xx rate",
            APISERVER_5XX_RATE,
            {"h": 4, "w": 8, "x": 0, "y": 4},
            unit="req/s",
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 0.05},
                    {"color": "orange", "value": 0.2},
                    {"color": "red", "value": 0.5},
                ],
            },
            decimals=3,
        )
    )
    panels.append(
        stat_panel(
            10,
            "API Server P99 latency",
            APISERVER_P99_LATENCY_MS,
            {"h": 4, "w": 8, "x": 8, "y": 4},
            unit="ms",
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 250},
                    {"color": "orange", "value": 400},
                    {"color": "red", "value": 600},
                ],
            },
            decimals=1,
        )
    )
    panels.append(
        stat_panel(
            11,
            "etcd P99 latency",
            ETCD_P99_LATENCY_MS,
            {"h": 4, "w": 8, "x": 16, "y": 4},
            unit="ms",
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 50},
                    {"color": "orange", "value": 100},
                    {"color": "red", "value": 200},
                ],
            },
            decimals=1,
        )
    )
    panels.append(
        timeseries_panel(
            4,
            "Node CPU",
            node_cpu_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 8},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            5,
            "Node RAM",
            node_mem_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 17},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            6,
            "Control Plane (incl. titan-db) CPU",
            node_cpu_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 0, "y": 26},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            7,
            "Control Plane (incl. titan-db) RAM",
            node_mem_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 12, "y": 26},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            8,
            "Root Filesystem Usage",
            root_usage_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 35},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    panels.append(
        timeseries_panel(
            # Was id 9, which collided with the "API Server 5xx rate" stat
            # panel above; panel ids must be unique within a dashboard.
            12,
            "Astraios Usage",
            astraios_usage_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 44},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    return {
        "uid": "atlas-nodes",
        "title": "Atlas Nodes",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "nodes"],
    }
def build_storage_dashboard():
    """Build the internal "Atlas Storage" Grafana dashboard.

    Tracks the Astreae and Asteria shared filesystems (current usage,
    free space, per-node breakdowns, and 30/90-day history) plus the
    maintenance image-sweeper DaemonSet/CronJob health.

    Returns:
        dict: Grafana dashboard model (uid ``atlas-storage``).
    """
    panels = []
    panels.append(
        stat_panel(
            1,
            "Astreae Usage",
            astreae_usage_expr("/mnt/astreae"),
            {"h": 5, "w": 6, "x": 0, "y": 0},
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
        )
    )
    panels.append(
        stat_panel(
            2,
            "Asteria Usage",
            astreae_usage_expr("/mnt/asteria"),
            {"h": 5, "w": 6, "x": 6, "y": 0},
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
        )
    )
    panels.append(
        stat_panel(
            3,
            "Astreae Free",
            astreae_free_expr("/mnt/astreae"),
            {"h": 5, "w": 6, "x": 12, "y": 0},
            unit="decbytes",
        )
    )
    panels.append(
        stat_panel(
            4,
            "Asteria Free",
            astreae_free_expr("/mnt/asteria"),
            {"h": 5, "w": 6, "x": 18, "y": 0},
            unit="decbytes",
        )
    )
    panels.append(
        timeseries_panel(
            5,
            "Astreae Per-Node Usage",
            filesystem_usage_expr("/mnt/astreae", LONGHORN_NODE_REGEX),
            {"h": 9, "w": 12, "x": 0, "y": 5},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    panels.append(
        timeseries_panel(
            6,
            "Asteria Per-Node Usage",
            filesystem_usage_expr("/mnt/asteria", LONGHORN_NODE_REGEX),
            {"h": 9, "w": 12, "x": 12, "y": 5},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    panels.append(
        timeseries_panel(
            7,
            "Astreae Usage History",
            astreae_usage_expr("/mnt/astreae"),
            {"h": 9, "w": 12, "x": 0, "y": 14},
            unit="percent",
            time_from="90d",
        )
    )
    panels.append(
        timeseries_panel(
            8,
            "Asteria Usage History",
            astreae_usage_expr("/mnt/asteria"),
            {"h": 9, "w": 12, "x": 12, "y": 14},
            unit="percent",
            time_from="90d",
        )
    )
    panels.append(
        stat_panel(
            30,
            "Maintenance Sweepers Ready",
            # Ready / desired daemonset replicas, as a percentage.
            'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 100',
            {"h": 4, "w": 12, "x": 0, "y": 44},
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
        )
    )
    panels.append(
        stat_panel(
            31,
            "Maintenance Cron Freshness (s)",
            # Seconds since the image-sweeper CronJob last succeeded.
            'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"})',
            {"h": 4, "w": 12, "x": 12, "y": 44},
            unit="s",
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 3600},
                    {"color": "red", "value": 10800},
                ],
            },
        )
    )
    return {
        "uid": "atlas-storage",
        "title": "Atlas Storage",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "storage"],
    }
def build_network_dashboard():
    """Build the internal "Atlas Network" Grafana dashboard.

    Covers the Traefik ingress SLO (success rate, 1h/6h error-budget
    burn, edge P99 latency), cluster ingress/egress/internal throughput,
    per-node and per-workload bandwidth, and Traefik router/entrypoint
    request rates.

    Returns:
        dict: Grafana dashboard model (uid ``atlas-network``).
    """
    panels = []
    panels.append(
        stat_panel(
            1,
            "Ingress Success Rate (5m)",
            TRAEFIK_SLI_5M,
            {"h": 4, "w": 6, "x": 0, "y": 0},
            unit="percentunit",
            decimals=2,
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "orange", "value": 0.995},
                    {"color": "yellow", "value": 0.999},
                    {"color": "green", "value": 0.9995},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            2,
            "Error Budget Burn (1h)",
            traefik_burn("1h"),
            {"h": 4, "w": 6, "x": 6, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 1},
                    {"color": "orange", "value": 2},
                    {"color": "red", "value": 4},
                ],
            },
            decimals=2,
        )
    )
    panels.append(
        stat_panel(
            3,
            "Error Budget Burn (6h)",
            traefik_burn("6h"),
            {"h": 4, "w": 6, "x": 12, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 1},
                    {"color": "orange", "value": 2},
                    {"color": "red", "value": 4},
                ],
            },
            decimals=2,
        )
    )
    panels.append(
        stat_panel(
            4,
            "Edge P99 Latency (ms)",
            TRAEFIK_P99_LATENCY_MS,
            {"h": 4, "w": 6, "x": 18, "y": 0},
            unit="ms",
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 200},
                    {"color": "orange", "value": 350},
                    {"color": "red", "value": 500},
                ],
            },
            decimals=1,
        )
    )
    panels.append(
        stat_panel(
            5,
            "Ingress Traffic",
            NET_INGRESS_EXPR,
            {"h": 4, "w": 8, "x": 0, "y": 4},
            unit="Bps",
        )
    )
    panels.append(
        stat_panel(
            6,
            "Egress Traffic",
            NET_EGRESS_EXPR,
            {"h": 4, "w": 8, "x": 8, "y": 4},
            unit="Bps",
        )
    )
    panels.append(
        stat_panel(
            7,
            "Intra-Cluster Traffic",
            NET_INTERNAL_EXPR,
            {"h": 4, "w": 8, "x": 16, "y": 4},
            unit="Bps",
        )
    )
    panels.append(
        timeseries_panel(
            8,
            "Per-Node Throughput",
            # Physical TX+RX per instance, relabelled to node names.
            f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
            {"h": 8, "w": 24, "x": 0, "y": 8},
            unit="Bps",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        table_panel(
            9,
            "Top Namespaces",
            'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
            '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
            {"h": 9, "w": 12, "x": 0, "y": 16},
            unit="Bps",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            10,
            "Top Pods",
            'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
            '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
            {"h": 9, "w": 12, "x": 12, "y": 16},
            unit="Bps",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        timeseries_panel(
            11,
            "Traefik Routers (req/s)",
            f"topk(10, {TRAEFIK_ROUTER_EXPR})",
            {"h": 9, "w": 12, "x": 0, "y": 25},
            unit="req/s",
            legend="{{router}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            12,
            "Traefik Entrypoints (req/s)",
            'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
            {"h": 9, "w": 12, "x": 12, "y": 25},
            unit="req/s",
            legend="{{entrypoint}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    return {
        "uid": "atlas-network",
        "title": "Atlas Network",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "network"],
    }
def build_mail_dashboard():
    """Build the internal "Atlas Mail" Grafana dashboard.

    Top row: outbound send volume, bounce count/rate, and success rate
    from the Postmark exporter. Second row: 30-day sending-limit
    consumption and exporter health. Bottom rows: 1d-vs-7d trend panels
    for the same series.

    Returns:
        dict: Grafana dashboard model (uid ``atlas-mail``).
    """
    # Bounce rate: green below 5, red at 10 and above.
    bounce_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 5},
            {"color": "orange", "value": 8},
            {"color": "red", "value": 10},
        ],
    }
    # Sending-limit consumption: alert as we approach the monthly quota.
    quota_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 70},
            {"color": "orange", "value": 85},
            {"color": "red", "value": 95},
        ],
    }
    # Delivery success rate: red until 90, green from 98 up.
    delivery_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "red", "value": None},
            {"color": "orange", "value": 90},
            {"color": "yellow", "value": 95},
            {"color": "green", "value": 98},
        ],
    }
    # Panel 3 is hand-rolled: it shows two targets (rate + count) with
    # per-series unit overrides, which the stat_panel helper cannot express.
    bounce_stat = {
        "id": 3,
        "type": "stat",
        "title": "Mail Bounces (1d)",
        "datasource": PROM_DS,
        "gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
        "targets": [
            {
                "expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
                "refId": "A",
                "legendFormat": "Rate",
            },
            {
                "expr": 'max(postmark_outbound_bounced{window="1d"})',
                "refId": "B",
                "legendFormat": "Count",
            },
        ],
        "fieldConfig": {
            "defaults": {
                "color": {"mode": "thresholds"},
                "custom": {"displayMode": "auto"},
                "thresholds": bounce_thresholds,
                "unit": "none",
            },
            "overrides": [
                {
                    "matcher": {"id": "byName", "options": "Rate"},
                    "properties": [{"id": "unit", "value": "percent"}],
                },
                {
                    "matcher": {"id": "byName", "options": "Count"},
                    "properties": [{"id": "unit", "value": "none"}],
                },
            ],
        },
        "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "textMode": "name_and_value",
        },
    }
    panels = [
        stat_panel(
            1,
            "Sent (1d)",
            'max(postmark_outbound_sent{window="1d"})',
            {"h": 4, "w": 6, "x": 0, "y": 0},
            decimals=0,
        ),
        stat_panel(
            2,
            "Sent (7d)",
            'max(postmark_outbound_sent{window="7d"})',
            {"h": 4, "w": 6, "x": 6, "y": 0},
            decimals=0,
        ),
        bounce_stat,
        stat_panel(
            4,
            "Success Rate (1d)",
            'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
            {"h": 4, "w": 6, "x": 18, "y": 0},
            unit="percent",
            thresholds=delivery_thresholds,
            decimals=1,
        ),
        stat_panel(
            5,
            "Limit Used (30d)",
            "max(postmark_sending_limit_used_percent)",
            {"h": 4, "w": 6, "x": 0, "y": 4},
            thresholds=quota_thresholds,
            unit="percent",
            decimals=1,
        ),
        stat_panel(
            6,
            "Send Limit (30d)",
            "max(postmark_sending_limit)",
            {"h": 4, "w": 6, "x": 6, "y": 4},
            decimals=0,
        ),
        stat_panel(
            7,
            "Last Success",
            "max(postmark_last_success_timestamp_seconds)",
            {"h": 4, "w": 6, "x": 12, "y": 4},
            unit="dateTimeAsIso",
            decimals=0,
        ),
        stat_panel(
            8,
            "Exporter Errors",
            "sum(postmark_request_errors_total)",
            {"h": 4, "w": 6, "x": 18, "y": 4},
            decimals=0,
        ),
    ]
    # The 1d-vs-7d trend panels all share the same legend layout.
    trend_specs = (
        (13, "Bounce Rate (1d vs 7d)", "max by (window) (postmark_outbound_bounce_rate)",
         {"h": 8, "w": 12, "x": 0, "y": 12}, "percent"),
        (14, "Bounced (1d vs 7d)", "max by (window) (postmark_outbound_bounced)",
         {"h": 8, "w": 12, "x": 12, "y": 12}, "none"),
        (15, "Sent (1d vs 7d)", "max by (window) (postmark_outbound_sent)",
         {"h": 8, "w": 12, "x": 0, "y": 20}, "none"),
    )
    for panel_id, title, expr, grid, unit in trend_specs:
        panels.append(
            timeseries_panel(
                panel_id,
                title,
                expr,
                grid,
                unit=unit,
                legend="{{window}}",
                legend_display="table",
                legend_placement="right",
            )
        )
    panels.append(
        timeseries_panel(
            16,
            "Exporter Errors",
            "sum(postmark_request_errors_total)",
            {"h": 8, "w": 12, "x": 12, "y": 20},
            unit="none",
        )
    )
    return {
        "uid": "atlas-mail",
        "title": "Atlas Mail",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-30d", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "mail"],
    }
def build_jobs_dashboard():
panels = []
suite_var = "${suite:regex}"
success = PLATFORM_TEST_SUCCESS_STATUS
exported = PLATFORM_TEST_EXPORT_FILTER
runs_selector = f'suite=~"{suite_var}",{exported}'
runs_success_selector = f'{runs_selector},status=~"{success}"'
runs_failure_selector = f'{runs_selector},status!~"{success}"'
checks_selector = f'__name__=~".*_quality_gate_checks_total",suite=~"{suite_var}",{exported}'
tests_selector = f'__name__=~".*_quality_gate_tests_total",suite=~"{suite_var}",{exported}'
coverage_metric_selector = f'__name__=~".*_quality_gate_coverage_percent",suite=~"{suite_var}",{exported}'
workspace_coverage_selector = f'suite=~"{suite_var}",{exported}'
smell_selector = f'suite=~"{suite_var}",{exported}'
suite_universe = " or ".join(
f'label_replace(vector(1), "suite", "{suite}", "__name__", ".*")'
for suite in PLATFORM_TEST_SUITE_NAMES
)
runs_24h = f'(sum(increase(platform_quality_gate_runs_total{{{runs_selector}}}[24h])) or on() vector(0))'
runs_30d = f'(sum(increase(platform_quality_gate_runs_total{{{runs_selector}}}[30d])) or on() vector(0))'
success_24h = (
f'(sum(increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[24h])) or on() vector(0))'
)
success_30d = (
f'(sum(increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[30d])) or on() vector(0))'
)
failures_24h = (
f'(sum(increase(platform_quality_gate_runs_total{{{runs_failure_selector}}}[24h])) or on() vector(0))'
)
success_rate_24h = f"100 * ({success_24h}) / clamp_min(({runs_24h}), 1)"
success_rate_30d = f"100 * ({success_30d}) / clamp_min(({runs_30d}), 1)"
success_rate_by_suite_24h = (
f'sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[24h]))) '
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[24h]))), 1))'
)
failures_by_suite_24h = (
f'sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_failure_selector}}}[24h]))'
)
success_history_by_suite_core = (
f'100 * (sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[$__interval])) '
f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[$__interval]))), 1))'
)
success_history_by_suite = (
f'({success_history_by_suite_core}) '
f'or on(suite) (0 * sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[$__range])))'
)
coverage_by_suite = (
f'(max by (suite) ({{{coverage_metric_selector}}})) '
f'or on(suite) (max by (suite) (platform_quality_gate_workspace_line_coverage_percent{{{workspace_coverage_selector}}}))'
)
coverage_with_missing = (
f"({coverage_by_suite}) or on(suite) (0 * (sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[30d]))) - 1)"
)
coverage_gap = f"clamp_min(95 - ({coverage_by_suite}), 0)"
smell_by_suite = f'max by (suite) (platform_quality_gate_source_lines_over_500_total{{{smell_selector}}})'
smell_with_missing = (
f"({smell_by_suite}) or on(suite) (0 * (sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[30d]))) - 1)"
)
average_coverage = f"(avg(({coverage_by_suite})) or on() vector(0))"
suites_loc_violating = f'(sum((({smell_by_suite}) > bool 0)) or on() vector(0))'
checks_failed_total = f'(sum({{{checks_selector},result!~"{success}"}}) or on() vector(0))'
checks_failed_tests = (
f'(sum(count by (suite) ({{{checks_selector},check=~"tests|unit|build",result!~"{success}"}})) or on() vector(0))'
)
checks_failed_coverage = (
f'(sum(count by (suite) ({{{checks_selector},check=~"coverage",result!~"{success}"}})) or on() vector(0))'
)
checks_failed_loc = (
f'(sum(count by (suite) ({{{checks_selector},check=~"loc|smell",result!~"{success}"}})) or on() vector(0))'
)
checks_failed_docs = (
f'(sum(count by (suite) ({{{checks_selector},check=~"docs|naming|hygiene|lint|docs_naming",result!~"{success}"}})) or on() vector(0))'
)
checks_failed_gate = (
f'(sum(count by (suite) ({{{checks_selector},check=~"gate|glue|gate_glue",result!~"{success}"}})) or on() vector(0))'
)
checks_failed_sonarqube = (
f'(sum(count by (suite) ({{{checks_selector},check=~"sonarqube|sonar",result!~"{success}"}})) or on() vector(0))'
)
checks_failed_supply_chain = (
f'(sum(count by (suite) ({{{checks_selector},check=~"ironbank|supply_chain|image_compliance|artifact_security",result!~"{success}"}})) or on() vector(0))'
)
check_regex_tests = "tests|unit|build"
check_regex_coverage = "coverage"
check_regex_loc = "loc|smell"
check_regex_style = "docs|naming|hygiene|lint|docs_naming|style"
check_regex_gate_glue = "gate|glue|gate_glue"
check_regex_sonarqube = "sonarqube|sonar"
check_regex_supply_chain = "ironbank|supply_chain|image_compliance|artifact_security"
def _check_state_series(regex: str, failed: bool) -> str:
state = f'result!~"{success}"' if failed else f'result=~"{success}"'
core = (
f'sum by (suite) (max_over_time(({{{checks_selector},check=~"{regex}",{state}}})[$__interval]))'
)
return f'({core}) or on(suite) (0 * ({suite_universe}))'
missing_tests_by_suite = (
f'(({suite_universe}) unless on(suite) count by (suite) ({{__name__=~".*_quality_gate_tests_total",{exported}}}))'
)
missing_checks_by_suite = (
f'(({suite_universe}) unless on(suite) count by (suite) ({{__name__=~".*_quality_gate_checks_total",{exported}}}))'
)
missing_coverage_by_suite = (
f'(({suite_universe}) unless on(suite) count by (suite) (platform_quality_gate_workspace_line_coverage_percent{{{exported}}}))'
)
missing_loc_by_suite = (
f'(({suite_universe}) unless on(suite) count by (suite) (platform_quality_gate_source_lines_over_500_total{{{exported}}}))'
)
success_thresholds = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 80},
{"color": "yellow", "value": 95},
{"color": "green", "value": 99},
],
}
failures_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 3},
{"color": "red", "value": 5},
],
}
coverage_gap_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 5},
{"color": "red", "value": 10},
],
}
smell_thresholds = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "green", "value": 0},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 3},
{"color": "red", "value": 5},
],
}
missing_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": 1},
],
}
panels.append(
stat_panel(
2,
"Success Rate (24h)",
success_rate_24h,
{"h": 5, "w": 4, "x": 0, "y": 0},
unit="percent",
decimals=2,
instant=True,
thresholds=success_thresholds,
)
)
panels.append(
stat_panel(
3,
"Success Rate (30d)",
success_rate_30d,
{"h": 5, "w": 4, "x": 4, "y": 0},
unit="percent",
decimals=2,
instant=True,
thresholds=success_thresholds,
)
)
panels.append(
stat_panel(
4,
"Failures (24h)",
failures_24h,
{"h": 5, "w": 4, "x": 8, "y": 0},
unit="none",
instant=True,
thresholds=failures_thresholds,
)
)
panels.append(
stat_panel(
5,
"Runs (24h)",
runs_24h,
{"h": 5, "w": 4, "x": 12, "y": 0},
unit="none",
instant=True,
thresholds={
"mode": "absolute",
"steps": [{"color": "red", "value": None}, {"color": "green", "value": 1}],
},
)
)
panels.append(
stat_panel(
6,
"Avg Coverage (%)",
average_coverage,
{"h": 5, "w": 4, "x": 16, "y": 0},
unit="percent",
decimals=2,
instant=True,
thresholds=success_thresholds,
)
)
panels.append(
stat_panel(
7,
"Suites with LOC >500",
suites_loc_violating,
{"h": 5, "w": 4, "x": 20, "y": 0},
unit="none",
instant=True,
thresholds=smell_thresholds,
)
)
panels.append(
stat_panel(
19,
"Failing Tests",
checks_failed_tests,
{"h": 4, "w": 3, "x": 0, "y": 5},
unit="none",
instant=True,
thresholds=failures_thresholds,
)
)
panels.append(
stat_panel(
20,
"Failing Coverage",
checks_failed_coverage,
{"h": 4, "w": 3, "x": 3, "y": 5},
unit="none",
instant=True,
thresholds=failures_thresholds,
)
)
panels.append(
stat_panel(
21,
"Failing LOC",
checks_failed_loc,
{"h": 4, "w": 3, "x": 6, "y": 5},
unit="none",
instant=True,
thresholds=failures_thresholds,
)
)
panels.append(
stat_panel(
22,
"Failing Docs/Naming",
checks_failed_docs,
{"h": 4, "w": 3, "x": 9, "y": 5},
unit="none",
instant=True,
thresholds=failures_thresholds,
)
)
panels.append(
stat_panel(
23,
"Failing Gate/Glue",
checks_failed_gate,
{"h": 4, "w": 3, "x": 12, "y": 5},
unit="none",
instant=True,
thresholds=failures_thresholds,
)
)
panels.append(
stat_panel(
24,
"Failing SonarQube",
checks_failed_sonarqube,
{"h": 4, "w": 3, "x": 15, "y": 5},
unit="none",
instant=True,
thresholds=failures_thresholds,
)
)
panels.append(
stat_panel(
25,
"Failing Supply Chain",
checks_failed_supply_chain,
{"h": 4, "w": 3, "x": 18, "y": 5},
unit="none",
instant=True,
thresholds=failures_thresholds,
)
)
panels.append(
stat_panel(
26,
"Total Failing Checks",
checks_failed_total,
{"h": 4, "w": 3, "x": 21, "y": 5},
unit="none",
instant=True,
thresholds=failures_thresholds,
)
)
panels.append(
bargauge_panel(
8,
"Failures by Suite (24h)",
failures_by_suite_24h,
{"h": 8, "w": 8, "x": 0, "y": 9},
unit="none",
instant=True,
legend="{{suite}}",
thresholds=failures_thresholds,
)
)
panels.append(
bargauge_panel(
9,
"Success Rate by Suite (24h)",
success_rate_by_suite_24h,
{"h": 8, "w": 8, "x": 8, "y": 9},
unit="percent",
instant=True,
legend="{{suite}}",
sort_order="asc",
thresholds=success_thresholds,
decimals=2,
)
)
coverage_gap_panel = bargauge_panel(
10,
"Coverage Gap to 95% by Suite",
coverage_gap,
{"h": 8, "w": 8, "x": 16, "y": 9},
unit="percent",
instant=True,
legend="{{suite}}",
sort_order="desc",
thresholds=coverage_gap_thresholds,
decimals=2,
)
coverage_gap_panel["description"] = "Gap from the 95% target. 0 means the suite is at or above target."
panels.append(coverage_gap_panel)
history_panel = timeseries_panel(
11,
"Success History by Suite",
success_history_by_suite,
{"h": 8, "w": 24, "x": 0, "y": 17},
unit="percent",
legend="{{suite}}",
legend_display="list",
legend_placement="bottom",
)
history_panel["fieldConfig"]["defaults"]["min"] = 0
history_panel["fieldConfig"]["defaults"]["max"] = 100
history_panel["fieldConfig"]["defaults"]["custom"] = {
"drawStyle": "line",
"lineInterpolation": "linear",
"lineWidth": 2,
"fillOpacity": 8,
"showPoints": "always",
"pointSize": 3,
"spanNulls": True,
}
panels.append(history_panel)
panels.append(
timeseries_panel(
12,
"Run Outcomes (Selected Scope)",
None,
{"h": 8, "w": 8, "x": 0, "y": 25},
unit="none",
targets=[
{
"refId": "A",
"expr": f'sum(increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[$__interval])) or on() vector(0)',
"legendFormat": "Success",
},
{
"refId": "B",
"expr": f'sum(increase(platform_quality_gate_runs_total{{{runs_failure_selector}}}[$__interval])) or on() vector(0)',
"legendFormat": "Failure",
},
{
"refId": "C",
"expr": f'sum(increase(platform_quality_gate_runs_total{{{runs_selector}}}[$__interval])) or on() vector(0)',
"legendFormat": "Total",
},
],
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "sum"],
)
)
panels.append(
timeseries_panel(
13,
"Coverage & LOC History (Selected Scope)",
None,
{"h": 8, "w": 8, "x": 8, "y": 25},
unit="none",
targets=[
{
"refId": "A",
"expr": f'max_over_time(platform_quality_gate_workspace_line_coverage_percent{{{workspace_coverage_selector}}}[$__interval])',
"legendFormat": "{{suite}} coverage %",
},
{
"refId": "B",
"expr": f'max_over_time(platform_quality_gate_source_lines_over_500_total{{{smell_selector}}}[$__interval])',
"legendFormat": "{{suite}} files >500 LOC",
},
],
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "max"],
)
)
run_mix_panel = pie_panel(
14,
"Run Status Mix (30d)",
f'sum by (status) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[30d]))',
{"h": 8, "w": 8, "x": 16, "y": 25},
)
run_mix_panel["targets"][0]["legendFormat"] = "{{status}}"
run_mix_panel["fieldConfig"]["defaults"]["unit"] = "none"
panels.append(run_mix_panel)
panels.append(
timeseries_panel(
130,
"Fail Trend: Tests",
_check_state_series(check_regex_tests, True),
{"h": 6, "w": 3, "x": 0, "y": 33},
unit="none",
legend="{{suite}}",
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "max"],
)
)
panels.append(
timeseries_panel(
131,
"Fail Trend: Coverage",
_check_state_series(check_regex_coverage, True),
{"h": 6, "w": 3, "x": 3, "y": 33},
unit="none",
legend="{{suite}}",
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "max"],
)
)
panels.append(
timeseries_panel(
132,
"Fail Trend: LOC",
_check_state_series(check_regex_loc, True),
{"h": 6, "w": 3, "x": 6, "y": 33},
unit="none",
legend="{{suite}}",
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "max"],
)
)
panels.append(
timeseries_panel(
133,
"Fail Trend: Style",
_check_state_series(check_regex_style, True),
{"h": 6, "w": 3, "x": 9, "y": 33},
unit="none",
legend="{{suite}}",
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "max"],
)
)
panels.append(
timeseries_panel(
134,
"Fail Trend: Gate Glue",
_check_state_series(check_regex_gate_glue, True),
{"h": 6, "w": 3, "x": 12, "y": 33},
unit="none",
legend="{{suite}}",
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "max"],
)
)
panels.append(
timeseries_panel(
135,
"Fail Trend: SonarQube",
_check_state_series(check_regex_sonarqube, True),
{"h": 6, "w": 3, "x": 15, "y": 33},
unit="none",
legend="{{suite}}",
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "max"],
)
)
panels.append(
timeseries_panel(
136,
"Fail Trend: Supply Chain",
_check_state_series(check_regex_supply_chain, True),
{"h": 6, "w": 3, "x": 18, "y": 33},
unit="none",
legend="{{suite}}",
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "max"],
)
)
panels.append(
timeseries_panel(
138,
"Pass Trend: Tests",
_check_state_series(check_regex_tests, False),
{"h": 6, "w": 3, "x": 0, "y": 39},
unit="none",
legend="{{suite}}",
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "max"],
)
)
panels.append(
timeseries_panel(
139,
"Pass Trend: Coverage",
_check_state_series(check_regex_coverage, False),
{"h": 6, "w": 3, "x": 3, "y": 39},
unit="none",
legend="{{suite}}",
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "max"],
)
)
panels.append(
timeseries_panel(
140,
"Pass Trend: LOC",
_check_state_series(check_regex_loc, False),
{"h": 6, "w": 3, "x": 6, "y": 39},
unit="none",
legend="{{suite}}",
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "max"],
)
)
panels.append(
timeseries_panel(
141,
"Pass Trend: Style",
_check_state_series(check_regex_style, False),
{"h": 6, "w": 3, "x": 9, "y": 39},
unit="none",
legend="{{suite}}",
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "max"],
)
)
panels.append(
timeseries_panel(
142,
"Pass Trend: Gate Glue",
_check_state_series(check_regex_gate_glue, False),
{"h": 6, "w": 3, "x": 12, "y": 39},
unit="none",
legend="{{suite}}",
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "max"],
)
)
panels.append(
timeseries_panel(
143,
"Pass Trend: SonarQube",
_check_state_series(check_regex_sonarqube, False),
{"h": 6, "w": 3, "x": 15, "y": 39},
unit="none",
legend="{{suite}}",
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "max"],
)
)
panels.append(
timeseries_panel(
144,
"Pass Trend: Supply Chain",
_check_state_series(check_regex_supply_chain, False),
{"h": 6, "w": 3, "x": 18, "y": 39},
unit="none",
legend="{{suite}}",
legend_display="list",
legend_placement="bottom",
legend_calcs=["lastNotNull", "max"],
)
)
panels.append(
bargauge_panel(
15,
"Latest Test Counters (Suite + Result)",
f'sum by (suite, result) ({{{tests_selector}}})',
{"h": 6, "w": 3, "x": 21, "y": 39},
unit="none",
instant=True,
legend="{{suite}} · {{result}}",
sort_order="desc",
limit=24,
)
)
coverage_panel = bargauge_panel(
17,
"Coverage by Suite (Latest, gate 95)",
coverage_with_missing,
{"h": 8, "w": 12, "x": 0, "y": 45},
unit="percent",
instant=True,
legend="{{suite}}",
sort_order="asc",
thresholds=success_thresholds,
decimals=2,
)
coverage_panel["fieldConfig"]["defaults"]["mappings"] = [
{"type": "value", "options": {"-1": {"text": "missing"}}}
]
panels.append(coverage_panel)
smell_panel = bargauge_panel(
18,
"Files >500 LOC by Suite (Latest)",
smell_with_missing,
{"h": 8, "w": 12, "x": 12, "y": 45},
unit="none",
instant=True,
legend="{{suite}}",
sort_order="desc",
thresholds=smell_thresholds,
)
smell_panel["fieldConfig"]["defaults"]["mappings"] = [
{"type": "value", "options": {"-1": {"text": "missing"}}}
]
panels.append(smell_panel)
panels.append(
bargauge_panel(
27,
"Missing Tests Metrics by Suite",
missing_tests_by_suite,
{"h": 7, "w": 6, "x": 0, "y": 53},
unit="none",
instant=True,
legend="{{suite}}",
sort_order="desc",
thresholds=missing_thresholds,
decimals=0,
)
)
panels.append(
bargauge_panel(
28,
"Missing Checks Metrics by Suite",
missing_checks_by_suite,
{"h": 7, "w": 6, "x": 6, "y": 53},
unit="none",
instant=True,
legend="{{suite}}",
sort_order="desc",
thresholds=missing_thresholds,
decimals=0,
)
)
panels.append(
bargauge_panel(
29,
"Missing Coverage Metrics by Suite",
missing_coverage_by_suite,
{"h": 7, "w": 6, "x": 12, "y": 53},
unit="none",
instant=True,
legend="{{suite}}",
sort_order="desc",
thresholds=missing_thresholds,
decimals=0,
)
)
panels.append(
bargauge_panel(
30,
"Missing LOC Metrics by Suite",
missing_loc_by_suite,
{"h": 7, "w": 6, "x": 18, "y": 53},
unit="none",
instant=True,
legend="{{suite}}",
sort_order="desc",
thresholds=missing_thresholds,
decimals=0,
)
)
panels.append(
stat_panel(
31,
"SonarQube API Up",
"(max(sonarqube_up) or on() vector(0))",
{"h": 6, "w": 4, "x": 0, "y": 60},
unit="none",
instant=True,
thresholds={
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "green", "value": 1},
],
},
)
)
panels.append(
stat_panel(
32,
"Sonar Projects (Selected)",
f'(count(sonarqube_project_quality_gate_pass{{project_key=~"{suite_var}"}}) or on() vector(0))',
{"h": 6, "w": 4, "x": 4, "y": 60},
unit="none",
instant=True,
thresholds=failures_thresholds,
)
)
panels.append(
stat_panel(
33,
"Sonar Gate Fetch Errors",
"(max(sonarqube_quality_gate_fetch_errors_total) or on() vector(0))",
{"h": 6, "w": 4, "x": 8, "y": 60},
unit="none",
instant=True,
thresholds=failures_thresholds,
)
)
sonar_status_mix_panel = pie_panel(
34,
"Sonar Gate Status Mix (Selected)",
f'count by (status) (sonarqube_project_quality_gate_pass{{project_key=~"{suite_var}"}})',
{"h": 6, "w": 6, "x": 12, "y": 60},
)
sonar_status_mix_panel["targets"][0]["legendFormat"] = "{{status}}"
panels.append(sonar_status_mix_panel)
panels.append(
bargauge_panel(
35,
"Projects Failing Sonar Gate",
f'sort_desc(count by (project_key) (sonarqube_project_quality_gate_pass{{project_key=~"{suite_var}",status!~"OK|ok"}}))',
{"h": 6, "w": 6, "x": 18, "y": 60},
unit="none",
instant=True,
legend="{{project_key}}",
sort_order="desc",
thresholds=failures_thresholds,
)
)
return {
"uid": "atlas-jobs",
"title": "Atlas Testing",
"folderUid": PRIVATE_FOLDER,
"editable": True,
"panels": panels,
"time": {"from": "now-30d", "to": "now"},
"annotations": {"list": []},
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "testing", "quality-gate", "ci"],
"templating": {
"list": [
testing_suite_variable(),
]
},
}
def build_power_dashboard():
panels = []
status_mapping = [
{
"type": "value",
"options": {
"0": {"text": "⚡ Charging"},
"1": {"text": "🔋 Discharging"},
},
}
]
panels.append(
stat_panel(
1,
"UPS Current Load",
None,
{"h": 8, "w": 12, "x": 0, "y": 0},
unit="none",
decimals=1,
text_mode="name_and_value",
targets=[
{"refId": "A", "expr": ANANKE_UPS_DRAW_WATTS_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Draw (W)", "instant": True},
{"refId": "B", "expr": ANANKE_UPS_RUNTIME_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Discharge", "instant": True},
{"refId": "C", "expr": ANANKE_UPS_ON_BATTERY_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Status", "instant": True},
{"refId": "D", "expr": ANANKE_UPS_DRAW_WATTS_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Draw (W)", "instant": True},
{"refId": "E", "expr": ANANKE_UPS_RUNTIME_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Discharge", "instant": True},
{"refId": "F", "expr": ANANKE_UPS_ON_BATTERY_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Status", "instant": True},
],
field_overrides=[
{
"matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Draw (W)"},
"properties": [{"id": "unit", "value": "watt"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_DB_NODE}"}],
},
{
"matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Draw (W)"},
"properties": [{"id": "unit", "value": "watt"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}],
},
{
"matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Discharge"},
"properties": [{"id": "unit", "value": "s"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_DB_NODE}"}],
},
{
"matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Discharge"},
"properties": [{"id": "unit", "value": "s"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}],
},
{
"matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Status"},
"properties": [{"id": "mappings", "value": status_mapping}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_DB_NODE}"}],
},
{
"matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Status"},
"properties": [{"id": "mappings", "value": status_mapping}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}],
},
],
orientation="horizontal",
wide_layout=True,
description=(
"Per-UPS live snapshot: current draw in watts, estimated battery runtime if discharge started now, and charging/discharging status."
),
)
)
panels.append(
timeseries_panel(
2,
"UPS History (Power Draw)",
None,
{"h": 8, "w": 12, "x": 12, "y": 0},
unit="watt",
targets=[
{"refId": "A", "expr": ANANKE_UPS_DRAW_WATTS_DB_SERIES, "legendFormat": ANANKE_UPS_DB_NAME},
{"refId": "B", "expr": ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES, "legendFormat": ANANKE_UPS_TETHYS_NAME},
],
legend_display="table",
legend_placement="right",
description="Historical UPS power consumption in watts for titan-db and tethys.",
)
)
panels.append(
stat_panel(
3,
"Current Climate",
None,
{"h": 8, "w": 12, "x": 0, "y": 8},
unit="none",
decimals=2,
text_mode="name_and_value",
targets=[
{"refId": "A", "expr": CLIMATE_TEMP_MAX, "legendFormat": "Tent Temp (°C)", "instant": True},
{"refId": "B", "expr": CLIMATE_PRESSURE_CURRENT, "legendFormat": "Tent VPD (kPa)", "instant": True},
{"refId": "C", "expr": CLIMATE_HUMIDITY_MAX, "legendFormat": "Tent RH (%)", "instant": True},
{"refId": "D", "expr": CLIMATE_DEWPOINT_CURRENT, "legendFormat": "Dew Point (°C)", "instant": True},
],
field_overrides=[
{"matcher": {"id": "byName", "options": "Tent Temp (°C)"}, "properties": [{"id": "unit", "value": "celsius"}]},
{"matcher": {"id": "byName", "options": "Tent VPD (kPa)"}, "properties": [{"id": "unit", "value": "suffix:kPa"}]},
{"matcher": {"id": "byName", "options": "Tent RH (%)"}, "properties": [{"id": "unit", "value": "percent"}]},
{"matcher": {"id": "byName", "options": "Dew Point (°C)"}, "properties": [{"id": "unit", "value": "celsius"}]},
],
orientation="horizontal",
wide_layout=True,
description="Current tent temperature, humidity, VPD, and dew point. These render once Typhon climate telemetry is online.",
)
)
panels.append(
timeseries_panel(
4,
"Climate History",
None,
{"h": 8, "w": 12, "x": 12, "y": 8},
unit="celsius",
targets=[
{"refId": "A", "expr": CLIMATE_TEMP_SERIES, "legendFormat": "Temperature (°C)"},
{"refId": "B", "expr": CLIMATE_HUMIDITY_SERIES, "legendFormat": "Humidity (%)"},
{"refId": "C", "expr": CLIMATE_PRESSURE_SERIES, "legendFormat": "VPD (kPa)"},
{"refId": "D", "expr": CLIMATE_DEWPOINT_SERIES, "legendFormat": "Dew Point (°C)"},
],
field_overrides=[
{
"matcher": {"id": "byName", "options": "Humidity (%)"},
"properties": [
{"id": "unit", "value": "percent"},
],
},
{
"matcher": {"id": "byName", "options": "VPD (kPa)"},
"properties": [
{"id": "unit", "value": "none"},
{"id": "custom.axisPlacement", "value": "right"},
{"id": "custom.axisLabel", "value": "kPa"},
{"id": "decimals", "value": 2},
],
}
],
legend_display="table",
legend_placement="right",
description="Two-axis chart: tent temperature/humidity/dew point (left axis) and tent VPD in kPa (right axis).",
)
)
panels.append(
stat_panel(
5,
"Fan Activity",
None,
{"h": 8, "w": 12, "x": 0, "y": 16},
unit="none",
decimals=0,
text_mode="name_and_value",
targets=[
{"refId": "A", "expr": f"round({CLIMATE_FAN_OUTLET_CURRENT})", "legendFormat": "Inside Outlet", "instant": True},
{"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "Inside Inlet", "instant": True},
{"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Outside Inlet", "instant": True},
{"refId": "D", "expr": f"round({CLIMATE_FAN_INTERIOR_CURRENT})", "legendFormat": "Interior Fans", "instant": True},
],
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 7},
{"color": "red", "value": 9},
],
},
orientation="horizontal",
wide_layout=True,
description="Current fan activity levels (0-10): inside outlet, inside inlet, outside inlet, and interior fans.",
)
)
panels.append(
timeseries_panel(
6,
"Fan History (0-10)",
None,
{"h": 8, "w": 12, "x": 12, "y": 16},
unit="none",
max_value=10,
targets=[
{"refId": "A", "expr": CLIMATE_FAN_OUTLET_SERIES, "legendFormat": "Inside Outlet"},
{"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "Inside Inlet"},
{"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Outside Inlet"},
{"refId": "D", "expr": CLIMATE_FAN_INTERIOR_SERIES, "legendFormat": "Interior Fans"},
],
legend_display="table",
legend_placement="right",
description="Historical fan activity for all four fan groups (0-10 scale).",
)
)
return {
"uid": "atlas-power",
"title": "Atlas Power",
"folderUid": PRIVATE_FOLDER,
"editable": True,
"panels": panels,
"time": {"from": "now-24h", "to": "now"},
"annotations": {"list": []},
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "power", "climate"],
}
def build_gpu_dashboard():
panels = []
gpu_scope = "$namespace_scope_gpu"
panels.append(
pie_panel(
1,
"Namespace GPU Share",
namespace_gpu_share_expr(gpu_scope),
{"h": 8, "w": 12, "x": 0, "y": 0},
links=namespace_scope_links("namespace_scope_gpu"),
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
)
)
panels.append(
timeseries_panel(
2,
"GPU Util by Namespace",
namespace_gpu_usage_instant(gpu_scope),
{"h": 8, "w": 12, "x": 12, "y": 0},
unit="percent",
legend="{{namespace}}",
legend_display="table",
legend_placement="right",
)
)
panels.append(
timeseries_panel(
3,
"GPU Util by Node",
2026-01-27 21:43:37 -03:00
gpu_util_by_hostname(),
{"h": 8, "w": 12, "x": 0, "y": 8},
unit="percent",
legend="{{Hostname}}",
legend_display="table",
legend_placement="right",
)
)
panels.append(
table_panel(
4,
"Top Pods by GPU Util",
'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
{"h": 8, "w": 12, "x": 12, "y": 8},
unit="percent",
transformations=[{"id": "labelsToFields", "options": {}}],
)
)
return {
"uid": "atlas-gpu",
"title": "Atlas GPU",
"folderUid": PRIVATE_FOLDER,
"editable": True,
"panels": panels,
"time": {"from": "now-12h", "to": "now"},
"annotations": {"list": []},
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "gpu"],
"templating": {
"list": [
namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"),
namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"),
namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"),
]
},
}
DASHBOARDS = {
"atlas-overview": {
"builder": build_overview,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-overview.yaml",
},
"atlas-pods": {
"builder": build_pods_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-pods.yaml",
},
"atlas-nodes": {
"builder": build_nodes_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-nodes.yaml",
},
"atlas-storage": {
"builder": build_storage_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-storage.yaml",
},
2025-11-17 16:27:38 -03:00
"atlas-network": {
"builder": build_network_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml",
},
"atlas-mail": {
"builder": build_mail_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml",
},
2026-01-21 13:37:36 -03:00
"atlas-jobs": {
"builder": build_jobs_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml",
},
"atlas-power": {
"builder": build_power_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-power.yaml",
},
"atlas-gpu": {
"builder": build_gpu_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",
},
}
2025-11-17 16:27:38 -03:00
def write_json(uid, data):
DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
path = DASHBOARD_DIR / f"{uid}.json"
path.write_text(json.dumps(data, indent=2) + "\n")
2025-11-17 16:27:38 -03:00
def render_configmap(uid, info):
json_path = DASHBOARD_DIR / f"{uid}.json"
payload = json.dumps(json.loads(json_path.read_text()), indent=2)
indented = "\n".join(" " + line for line in payload.splitlines())
2025-11-17 16:27:38 -03:00
output_path = info["configmap"]
content = CONFIG_TEMPLATE.format(
relative_path=output_path.relative_to(ROOT),
name=output_path.stem,
key=json_path.name,
payload=indented,
)
output_path.write_text(content)
print(f"Rendered {json_path.name} -> {output_path.relative_to(ROOT)}")
def main():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--build", action="store_true", help="Regenerate dashboard JSON files from builders")
args = parser.parse_args()
if args.build:
for uid, info in DASHBOARDS.items():
write_json(uid, info["builder"]())
for uid, info in DASHBOARDS.items():
render_configmap(uid, info)
if __name__ == "__main__":
main()