2025-11-17 14:22:46 -03:00
#!/usr/bin/env python3
2025-11-17 16:27:38 -03:00
""" Generate Atlas Grafana dashboards and render them into ConfigMaps.
2025-11-17 14:22:46 -03:00
Usage :
2025-12-02 13:16:00 -03:00
scripts / dashboards_render_atlas . py - - build # rebuild JSON + ConfigMaps
scripts / dashboards_render_atlas . py # re-render ConfigMaps from JSON
2025-11-17 14:22:46 -03:00
"""
2025-11-17 16:27:38 -03:00
2025-11-17 14:22:46 -03:00
import argparse
import json
import textwrap
2026-01-01 14:44:33 -03:00
import urllib . parse
2025-11-17 14:22:46 -03:00
from pathlib import Path
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# Paths, folders, and shared metadata
# ---------------------------------------------------------------------------
2025-11-17 14:22:46 -03:00
# ROOT is the parent of the directory that contains this script.
ROOT = Path ( __file__ ) . resolve ( ) . parents [ 1 ]
# Directory (under ROOT) built from the three path components below.
DASHBOARD_DIR = ROOT / " services " / " monitoring " / " dashboards "
# ConfigMap text template; placeholders: relative_path, name, key, payload.
CONFIG_TEMPLATE = textwrap . dedent (
""" # {relative_path}
apiVersion : v1
kind : ConfigMap
metadata :
name : { name }
labels :
grafana_dashboard : " 1 "
data :
{ key } : |
{ payload }
"""
)
# Datasource reference dict (type + uid) shared by the module.
PROM_DS = { " type " : " prometheus " , " uid " : " atlas-vm " }
2025-12-02 14:41:39 -03:00
# Folder names for the public and private dashboard sets.
PUBLIC_FOLDER = " overview "
2025-11-17 16:27:38 -03:00
PRIVATE_FOLDER = " atlas-internal "
2026-04-11 11:54:43 -03:00
# Mountpoint passed to filesystem_usage_expr() by astraios_usage_expr().
ASTRAIOS_MOUNTPOINT = " /mnt/astraios "
2025-11-17 16:27:38 -03:00
# Absolute-mode colour steps: green base, then yellow/orange/red at 50/75/91.5.
PERCENT_THRESHOLDS = {
2025-12-12 21:13:31 -03:00
" mode " : " absolute " ,
2025-11-17 16:27:38 -03:00
" steps " : [
# A value of None marks the base step (no lower bound given).
{ " color " : " green " , " value " : None } ,
2025-12-12 21:13:31 -03:00
{ " color " : " yellow " , " value " : 50 } ,
{ " color " : " orange " , " value " : 75 } ,
{ " color " : " red " , " value " : 91.5 } ,
2025-11-17 16:27:38 -03:00
] ,
}
2026-01-05 13:30:33 -03:00
# Range window interpolated into the rate() call in namespace_cpu_raw().
NAMESPACE_CPU_WINDOW = " 1m "
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# Cluster metadata
# ---------------------------------------------------------------------------
# Node-name lists; the regexes and totals further down are derived from them.
CONTROL_PLANE_NODES = [ " titan-0a " , " titan-0b " , " titan-0c " ]
2026-01-06 09:50:40 -03:00
CONTROL_DEPENDENCIES = [ " titan-db " , " titan-jh " ]
2025-11-17 16:27:38 -03:00
# Control-plane nodes followed by their dependencies.
CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES
WORKER_NODES = [
" titan-04 " ,
" titan-05 " ,
" titan-06 " ,
" titan-07 " ,
" titan-08 " ,
" titan-09 " ,
" titan-10 " ,
" titan-11 " ,
2026-01-11 02:02:47 -03:00
" titan-20 " ,
" titan-21 " ,
2025-11-17 16:27:38 -03:00
" titan-12 " ,
" titan-13 " ,
" titan-14 " ,
" titan-15 " ,
2026-01-21 14:30:55 -03:00
" titan-16 " ,
2025-11-17 16:27:38 -03:00
" titan-17 " ,
" titan-18 " ,
" titan-19 " ,
" titan-22 " ,
" titan-24 " ,
]
# Regex alternations: each list joined with the separator literal.
CONTROL_REGEX = " | " . join ( CONTROL_PLANE_NODES )
CONTROL_ALL_REGEX = " | " . join ( CONTROL_ALL )
WORKER_REGEX = " | " . join ( WORKER_NODES )
# Expected node counts; CONTROL_TOTAL counts CONTROL_PLANE_NODES only (not CONTROL_ALL).
CONTROL_TOTAL = len ( CONTROL_PLANE_NODES )
WORKER_TOTAL = len ( WORKER_NODES )
# "/ <total>" strings built from the counts above.
CONTROL_SUFFIX = f " / { CONTROL_TOTAL } "
WORKER_SUFFIX = f " / { WORKER_TOTAL } "
2026-01-11 23:46:24 -03:00
# Namespaces considered infrastructure (excluded from workload counts)
2026-01-18 02:50:07 -03:00
INFRA_PATTERNS = [
" kube-.* " ,
" .*-system " ,
" traefik " ,
2026-01-11 23:46:24 -03:00
" monitoring " ,
2026-01-11 23:52:40 -03:00
" logging " ,
2026-01-12 00:26:46 -03:00
" cert-manager " ,
2026-01-11 23:52:40 -03:00
" maintenance " ,
" postgres " ,
2026-01-11 23:46:24 -03:00
]
2026-01-18 02:50:07 -03:00
# One anchored alternation (^(...)$) over every entry of INFRA_PATTERNS.
INFRA_REGEX = f " ^( { ' | ' . join ( INFRA_PATTERNS ) } )$ "
2026-01-11 23:46:24 -03:00
# Namespaces allowed on control plane without counting as workloads
CP_ALLOWED_NS = INFRA_REGEX
2025-11-17 18:55:11 -03:00
# Character-class regex matching titan-12..titan-19, titan-22 and titan-24.
LONGHORN_NODE_REGEX = " titan-1[2-9]|titan-2[24] "
2025-12-12 15:23:51 -03:00
# Seven integer widths; NOTE(review): presumably Grafana grid columns per gauge - confirm at the use site.
GAUGE_WIDTHS = [ 4 , 3 , 3 , 4 , 3 , 3 , 4 ]
2025-11-18 17:09:13 -03:00
# Sum of kube_pod_info series on control-plane nodes whose namespace does not
# match CP_ALLOWED_NS; "or on() vector(0)" supplies 0 when nothing matches.
CONTROL_WORKLOADS_EXPR = (
f ' sum(kube_pod_info {{ node=~ " { CONTROL_REGEX } " ,namespace!~ " { CP_ALLOWED_NS } " }} ) or on() vector(0) '
)
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# PromQL helpers
# ---------------------------------------------------------------------------
# node_uname_info (non-empty nodename) with the nodename value copied into a
# "node" label via label_replace; scoped_node_expr() joins against this on instance.
NODE_INFO = ' label_replace(node_uname_info { nodename!= " " }, " node " , " $1 " , " nodename " , " (.*) " ) '
def node_filter(regex):
    """Build a ``label_replace`` over ``node_uname_info`` limited to *regex*.

    Only series whose ``nodename`` matches *regex* are selected, and the
    ``nodename`` value is copied into a ``node`` label so the result can be
    joined on ``node``.
    """
    selector = f' label_replace(node_uname_info {{ nodename=~ " {regex} " }} , '
    relabel = ' " node " , " $1 " , " nodename " , " (.*) " ) '
    return selector + relabel
2025-11-17 14:22:46 -03:00
2025-11-17 16:27:38 -03:00
def scoped_node_expr(base, scope=""):
    """Average *base* per node, optionally restricted to nodes matching *scope*.

    *base* is joined on ``instance`` with ``NODE_INFO`` to pick up the ``node``
    label and then averaged by ``node``. When *scope* is non-empty the result
    is additionally multiplied, on ``node``, by ``node_filter(scope)``.
    """
    per_node = f" avg by (node) (( {base} ) * on(instance) group_left(node) {NODE_INFO} ) "
    if not scope:
        return per_node
    return f" ( {per_node} ) * on(node) group_left() {node_filter(scope)} "
def node_cpu_expr(scope=""):
    """Return the per-node CPU busy percentage expression.

    Busy is computed as ``(1 - idle) * 100`` where idle is the 5m rate of
    ``node_cpu_seconds_total`` in idle mode averaged per instance; the result
    is passed through ``scoped_node_expr`` with *scope*.
    """
    idle_fraction = ' avg by (instance) (rate(node_cpu_seconds_total { mode= " idle " }[5m])) '
    return scoped_node_expr(f" (1 - {idle_fraction} ) * 100 ", scope)
def node_mem_expr(scope=""):
    """Return the per-node memory usage percentage expression.

    Usage is ``(MemTotal - MemAvailable) / MemTotal * 100`` averaged per
    instance, then passed through ``scoped_node_expr`` with *scope*.
    """
    fragments = (
        " avg by (instance) ( ",
        " (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) ",
        " / node_memory_MemTotal_bytes * 100) ",
    )
    return scoped_node_expr("".join(fragments), scope)
def filesystem_usage_expr(mount, scope=""):
    """Return the per-node used-space percentage expression for *mount*.

    Computes ``(1 - avail / size) * 100`` from the node filesystem metrics for
    the given mountpoint (tmpfs and overlay filesystems excluded), averaged
    per instance, then passed through ``scoped_node_expr`` with *scope*.
    """
    # The same label matcher is applied to both the avail and size metrics.
    matcher = f'{{ mountpoint= " {mount} " ,fstype!~ " tmpfs|overlay " }}'
    fragments = [
        " avg by (instance) ( ",
        f" (1 - (node_filesystem_avail_bytes {matcher} ",
        f" / node_filesystem_size_bytes {matcher} )) * 100) ",
    ]
    return scoped_node_expr("".join(fragments), scope)
def root_usage_expr(scope=""):
    """Return ``filesystem_usage_expr`` for the root mountpoint literal."""
    root_mount = " / "
    return filesystem_usage_expr(root_mount, scope)
2026-04-11 11:54:43 -03:00
def astraios_usage_expr(scope=""):
    """Return ``filesystem_usage_expr`` for ``ASTRAIOS_MOUNTPOINT``."""
    usage = filesystem_usage_expr(ASTRAIOS_MOUNTPOINT, scope)
    return usage
2025-11-17 16:27:38 -03:00
def astreae_usage_expr(mount):
    """Return the aggregate used-space percentage expression for *mount*.

    Unlike the per-node variant, available and total bytes are each summed
    over every matching series before dividing, so a single value results.
    """
    matcher = f'{{ mountpoint= " {mount} " ,fstype!~ " tmpfs|overlay " }}'
    available = f" 100 - (sum(node_filesystem_avail_bytes {matcher} ) / "
    capacity = f" sum(node_filesystem_size_bytes {matcher} ) * 100) "
    return available + capacity
def astreae_free_expr(mount):
    """Return the summed available-bytes expression for *mount*."""
    matcher = f'{{ mountpoint= " {mount} " ,fstype!~ " tmpfs|overlay " }}'
    return f" sum(node_filesystem_avail_bytes {matcher} ) "
2025-11-17 20:19:20 -03:00
def topk_with_node(expr):
    """Wrap *expr* in ``topk(1, ...)`` and copy ``node`` into ``__name__``.

    The ``label_replace`` call writes the value of the ``node`` label into the
    metric-name label of the single top series.
    """
    top_series = f"topk(1, {expr} )"
    return f' label_replace({top_series}, " __name__ " , " $1 " , " node " , " (.*) " ) '
2025-11-17 20:19:20 -03:00
2025-11-17 20:14:11 -03:00
def node_net_expr(scope=""):
    """Return the per-node network throughput expression.

    Sums the 5m receive and transmit byte rates per instance (the ``lo``
    device is excluded) and passes the result through ``scoped_node_expr``.
    """
    fragments = (
        " sum by (instance) ( ",
        ' rate(node_network_receive_bytes_total { device!~ " lo " }[5m]) ',
        ' + rate(node_network_transmit_bytes_total { device!~ " lo " }[5m])) ',
    )
    return scoped_node_expr("".join(fragments), scope)
def node_io_expr(scope=""):
    """Return the per-node disk throughput expression.

    Sums the 5m read and written byte rates per instance and passes the
    result through ``scoped_node_expr`` with *scope*.
    """
    reads = " sum by (instance) (rate(node_disk_read_bytes_total[5m]) "
    writes = " + rate(node_disk_written_bytes_total[5m])) "
    return scoped_node_expr(reads + writes, scope)
2026-01-01 14:44:33 -03:00
def namespace_selector(scope_var):
    """Return the container-level label matchers followed by *scope_var*.

    The fixed part requires non-empty ``namespace``, ``pod`` and ``container``
    labels and excludes the ``POD`` container; *scope_var* is appended as the
    final matcher.
    """
    fixed_matchers = ' namespace!= " " ,pod!= " " ,container!= " " ,container!= " POD " , '
    return f"{fixed_matchers}{scope_var} "
2026-01-01 14:44:33 -03:00
def namespace_gpu_selector(scope_var):
    """Return non-empty ``namespace``/``pod`` matchers followed by *scope_var*."""
    fixed_matchers = ' namespace!= " " ,pod!= " " , '
    return f"{fixed_matchers}{scope_var} "
def namespace_cpu_raw(scope_var):
    """Return the per-namespace CPU usage rate expression.

    Uses ``container_cpu_usage_seconds_total`` filtered by
    ``namespace_selector(scope_var)`` over ``NAMESPACE_CPU_WINDOW``, summed
    by namespace.
    """
    selector = namespace_selector(scope_var)
    metric = " sum(rate(container_cpu_usage_seconds_total "
    windowed = f" {{ {selector} }} [ {NAMESPACE_CPU_WINDOW} ])) by (namespace) "
    return metric + windowed
2026-01-01 14:44:33 -03:00
def namespace_ram_raw(scope_var):
    """Return the per-namespace working-set memory expression.

    Sums ``container_memory_working_set_bytes`` by namespace, filtered by
    ``namespace_selector(scope_var)``.
    """
    selector = namespace_selector(scope_var)
    return f" sum(container_memory_working_set_bytes {{ {selector} }} ) by (namespace) "
def namespace_gpu_usage_instant(scope_var):
    """Return the per-namespace GPU usage expression for *scope_var*.

    Thin alias that delegates to ``gpu_usage_by_namespace``.
    """
    usage_expr = gpu_usage_by_namespace(scope_var)
    return usage_expr
2026-01-26 22:26:24 -03:00
def jetson_gpu_util_by_node():
    """Return the per-node maximum of ``jetson_gr3d_freq_percent``.

    Only series with a non-empty ``node`` label are considered.
    """
    expr = ' max by (node) (jetson_gr3d_freq_percent { node!= " " }) '
    return expr
2026-01-27 21:43:37 -03:00
def dcgm_gpu_util_by_node():
    """Return the per-node average of ``DCGM_FI_DEV_GPU_UTIL``.

    The DCGM series are relabelled so that ``Hostname`` becomes ``pod`` and a
    constant ``namespace`` label is attached; they are then joined with
    ``kube_pod_info`` on (namespace, pod) to obtain ``node`` and averaged by
    ``node``.
    """
    # Step 1: copy the Hostname label into "pod".
    with_pod = ' label_replace(DCGM_FI_DEV_GPU_UTIL, " pod " , " $1 " , " Hostname " , " (.*) " ) '
    # Step 2: attach the constant namespace label needed for the join.
    with_namespace = "".join(
        (" label_replace( ", with_pod, ' , " namespace " , " monitoring " , " " , " " ) ')
    )
    pod_info = ' kube_pod_info { namespace= " monitoring " } '
    joined = f" {with_namespace} * on(namespace,pod) group_left(node) "
    return " avg by (node) ( " + joined + pod_info + " ) "
2026-01-27 21:43:37 -03:00
def gpu_util_by_node():
    """Return DCGM per-node utilisation, falling back to the Jetson metric.

    The two sources are combined with PromQL ``or``, so the DCGM expression
    comes first and the Jetson expression second.
    """
    dcgm = dcgm_gpu_util_by_node()
    jetson = jetson_gpu_util_by_node()
    return f" {dcgm} or {jetson} "
def gpu_util_by_hostname():
    """Return ``gpu_util_by_node()`` with ``node`` copied into ``Hostname``."""
    pieces = (
        " label_replace( ",
        gpu_util_by_node(),
        ' , " Hostname " , " $1 " , " node " , " (.*) " ) ',
    )
    return "".join(pieces)
def gpu_node_labels():
    """Return the ``kube_node_labels`` selector identifying GPU nodes.

    A node qualifies when it has a non-empty ``label_accelerator`` or when
    ``label_jetson`` equals the literal checked below.
    """
    selector = ' kube_node_labels { label_accelerator=~ " .+ " } or kube_node_labels { label_jetson= " true " } '
    return selector
def gpu_requests_by_namespace_node(scope_var):
    """Return GPU resource requests summed by (namespace, node).

    Container requests for ``nvidia.com/gpu*`` resources (filtered by
    *scope_var*) are joined with ``kube_pod_info`` to obtain ``node`` and then
    multiplied, on ``node``, by ``gpu_node_labels()``.
    """
    requests = (
        f' kube_pod_container_resource_requests {{ resource=~ " nvidia.com/gpu.* " , {scope_var} }} '
    )
    on_gpu_nodes = f" * on(node) group_left() ( {gpu_node_labels()} ) "
    fragments = [
        " sum by (namespace,node) ( ",
        requests,
        " * on(namespace,pod) group_left(node) kube_pod_info ",
        on_gpu_nodes,
        " ) ",
    ]
    return "".join(fragments)
def gpu_usage_by_namespace(scope_var):
    """Return GPU utilisation attributed to each namespace.

    Each namespace's share of a node's GPU requests (requests divided by the
    node total, with the divisor clamped to at least 1) is multiplied by that
    node's utilisation from ``gpu_util_by_node()`` and summed by namespace.
    """
    per_ns_node = gpu_requests_by_namespace_node(scope_var)
    per_node = f" sum by (node) ( {per_ns_node} ) "
    request_fraction = f" ( {per_ns_node} ) / clamp_min( {per_node} , 1) "
    utilisation = f" * on(node) group_left() ( {gpu_util_by_node()} ) "
    return " sum by (namespace) ( " + request_fraction + utilisation + " ) "
def jetson_gpu_usage_by_namespace(scope_var):
    """Return Jetson GPU utilisation attributed to each namespace.

    Same shape as ``gpu_usage_by_namespace`` but weighted by
    ``jetson_gpu_util_by_node()`` only.
    """
    # NOTE(review): jetson_gpu_requests is not defined in the visible part of
    # this file - confirm it still exists, otherwise this raises NameError.
    per_ns_node = jetson_gpu_requests(scope_var)
    per_node = f" sum by (node) ( {per_ns_node} ) "
    request_fraction = f" ( {per_ns_node} ) / clamp_min( {per_node} , 1) "
    utilisation = f" * on(node) group_left() {jetson_gpu_util_by_node()} "
    return " sum by (namespace) ( " + request_fraction + utilisation + " ) "
2026-01-01 14:44:33 -03:00
2025-11-18 14:08:33 -03:00
def namespace_share_expr(resource_expr):
    """Return *resource_expr* as a percentage of its own total.

    The divisor is ``sum(resource_expr)`` clamped to at least 1, which keeps
    the division defined when the total is zero.
    """
    denominator = f" clamp_min(sum( {resource_expr} ), 1) "
    numerator = f" 100 * ( {resource_expr} ) "
    return f"{numerator}/ {denominator} "
2025-11-17 21:57:40 -03:00
2026-01-01 14:44:33 -03:00
def namespace_cpu_share_expr(scope_var):
    """Return each namespace's percentage share of CPU usage for *scope_var*."""
    cpu_usage = namespace_cpu_raw(scope_var)
    return namespace_share_expr(cpu_usage)
2025-11-18 14:08:33 -03:00
2026-01-01 14:44:33 -03:00
def namespace_ram_share_expr(scope_var):
    """Return each namespace's percentage share of memory usage for *scope_var*."""
    ram_usage = namespace_ram_raw(scope_var)
    return namespace_share_expr(ram_usage)
2025-11-18 00:11:39 -03:00
2026-01-01 14:44:33 -03:00
def namespace_gpu_share_expr(scope_var):
    """Return each namespace's percentage share of GPU usage, plus an idle row.

    The share is usage divided by total usage (total defaults to 0 when empty
    and the divisor is clamped to at least 1). A synthetic series labelled
    ``namespace="idle"`` is OR-ed in; its value is 100 multiplied by whether
    the total equals 0.
    """
    gpu_usage = namespace_gpu_usage_instant(scope_var)
    cluster_total = f" (sum( {gpu_usage} ) or on() vector(0)) "
    share = f" 100 * ( {gpu_usage} ) / clamp_min( {cluster_total} , 1) "
    idle = "".join(
        (
            ' label_replace(vector(100), " namespace " , " idle " , " " , " " ) * scalar( ',
            cluster_total,
            " == bool 0) ",
        )
    )
    return f" ( {share} ) or ( {idle} ) "
2025-11-17 23:12:16 -03:00
2025-12-12 20:30:00 -03:00
# Count of pods in a phase other than Running/Succeeded; 0 when none match.
PROBLEM_PODS_EXPR = (
' sum(max by (namespace,pod) (kube_pod_status_phase { phase!~ " Running|Succeeded " })) '
" or on() vector(0) "
)
2025-11-17 16:27:38 -03:00
# Count of pods with a container waiting on CrashLoopBackOff or ImagePullBackOff.
CRASHLOOP_EXPR = (
' sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason '
2025-12-12 20:30:00 -03:00
' { reason=~ " CrashLoopBackOff|ImagePullBackOff " })) '
" or on() vector(0) "
2025-11-17 16:27:38 -03:00
)
# Count of pods whose deletion timestamp is more than 600 seconds in the past.
STUCK_TERMINATING_EXPR = (
2025-11-17 18:55:11 -03:00
' sum(max by (namespace,pod) ( '
' ((time() - kube_pod_deletion_timestamp { pod!= " " }) > bool 600) '
' and on(namespace,pod) (kube_pod_deletion_timestamp { pod!= " " } > bool 0) '
2025-12-12 20:30:00 -03:00
' )) '
" or on() vector(0) "
2025-11-17 16:27:38 -03:00
)
2025-12-19 13:46:34 -03:00
# Lookback window used by the uptime subquery in UPTIME_AVG_EXPR.
UPTIME_WINDOW = " 365d "
2025-12-19 14:56:29 -03:00
# Keep the subquery step coarse so we don't request an excessive number of points.
UPTIME_STEP = " 1h "
2025-12-12 15:56:33 -03:00
# Available / desired replicas of the "traefik" deployment (divisor clamped to >= 1).
TRAEFIK_READY_EXPR = (
" ( "
' sum(kube_deployment_status_replicas_available { namespace=~ " traefik|kube-system " ,deployment= " traefik " }) '
" / clamp_min( "
' sum(kube_deployment_spec_replicas { namespace=~ " traefik|kube-system " ,deployment= " traefik " }), 1) '
" ) "
)
# Ready control-plane nodes divided by CONTROL_TOTAL.
CONTROL_READY_FRACTION_EXPR = (
f " (sum(kube_node_status_condition {{ condition= \" Ready \" ,status= \" true \" ,node=~ \" { CONTROL_REGEX } \" }} ) "
f " / { CONTROL_TOTAL } ) "
)
# Availability combines the two fractions above through a two-argument min(...).
# NOTE(review): multi-argument min() is not standard PromQL - confirm the datasource accepts it.
UPTIME_AVAIL_EXPR = (
f " min(( { CONTROL_READY_FRACTION_EXPR } ), ( { TRAEFIK_READY_EXPR } )) "
)
2025-12-13 15:51:45 -03:00
# Tie-breaker to deterministically pick one node per namespace when shares tie.
# Each node contributes node_filter(node) * 1e-6 * its 1-based position in
# CONTROL_ALL + WORKER_NODES.
# NOTE(review): the terms are joined with "+"; PromQL vector addition keeps only
# series whose label sets match on both sides and every term selects a different
# nodename - confirm this does not evaluate to an empty vector.
NODE_TIEBREAKER = " + " . join (
f " ( { node_filter ( node ) } ) * 1e-6 * { idx } "
for idx , node in enumerate ( CONTROL_ALL + WORKER_NODES , start = 1 )
)
2025-12-19 14:56:29 -03:00
# Average of the availability expression over UPTIME_WINDOW, sampled every UPTIME_STEP.
UPTIME_AVG_EXPR = f " avg_over_time(( { UPTIME_AVAIL_EXPR } )[ { UPTIME_WINDOW } : { UPTIME_STEP } ]) "
2025-12-12 16:36:47 -03:00
# Alias: the "percent" expression is the same 0..1 average (see the 0.99.. thresholds below).
UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
2025-12-12 15:56:33 -03:00
# "Number of nines": -log10(1 - availability), with availability capped below 1
# so the logarithm argument never reaches 0.
UPTIME_NINES_EXPR = f " -log10(1 - clamp_max( { UPTIME_AVG_EXPR } , 0.999999999)) "
2025-12-12 15:23:51 -03:00
# Colour steps for the nines value: red base, orange/yellow/green at 2/3/3.5.
UPTIME_THRESHOLDS = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " yellow " , " value " : 3 } ,
{ " color " : " green " , " value " : 3.5 } ,
] ,
}
2025-12-12 16:11:28 -03:00
# Colour steps for the availability fraction (0.99 .. 0.99999).
UPTIME_PERCENT_THRESHOLDS = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
2025-12-15 22:14:26 -03:00
{ " color " : " orange " , " value " : 0.99 } ,
{ " color " : " yellow " , " value " : 0.999 } ,
{ " color " : " green " , " value " : 0.9999 } ,
{ " color " : " blue " , " value " : 0.99999 } ,
2025-12-12 16:11:28 -03:00
] ,
}
2025-11-17 16:27:38 -03:00
# Pod age in seconds (time() - kube_pod_created) for pods in a non-Running/
# non-Succeeded phase, with node and phase labels joined in.
PROBLEM_TABLE_EXPR = (
" (time() - kube_pod_created { pod!= \" \" }) "
" * on(namespace,pod) group_left(node) kube_pod_info "
" * on(namespace,pod) group_left(phase) "
" max by (namespace,pod,phase) (kube_pod_status_phase { phase!~ \" Running|Succeeded \" }) "
)
# Pod age for pods with a container waiting on CrashLoopBackOff/ImagePullBackOff.
# NOTE(review): the final join matches on (namespace,pod,container) while the
# left-hand side is built from kube_pod_created and kube_pod_info - confirm
# those series carry a "container" label, otherwise nothing matches.
CRASHLOOP_TABLE_EXPR = (
" (time() - kube_pod_created { pod!= \" \" }) "
" * on(namespace,pod) group_left(node) kube_pod_info "
" * on(namespace,pod,container) group_left(reason) "
" max by (namespace,pod,container,reason) "
" (kube_pod_container_status_waiting_reason { reason=~ \" CrashLoopBackOff|ImagePullBackOff \" }) "
)
# Seconds since the deletion timestamp for pods that have one, with node joined in.
STUCK_TABLE_EXPR = (
2025-11-17 18:55:11 -03:00
" ( "
2025-11-17 16:27:38 -03:00
" ((time() - kube_pod_deletion_timestamp { pod!= \" \" }) "
2025-11-17 18:55:11 -03:00
" and on(namespace,pod) (kube_pod_deletion_timestamp { pod!= \" \" } > bool 0)) "
" * on(namespace,pod) group_left(node) kube_pod_info "
" ) "
2025-11-17 16:27:38 -03:00
)
2026-01-11 23:46:24 -03:00
# Namespace matchers: workload = everything NOT matching INFRA_REGEX.
NAMESPACE_SCOPE_WORKLOAD = f ' namespace!~ " { INFRA_REGEX } " '
2026-01-01 14:16:08 -03:00
# Matches every namespace.
NAMESPACE_SCOPE_ALL = ' namespace=~ " .* " '
2026-01-11 23:46:24 -03:00
# Infrastructure = namespaces matching INFRA_REGEX.
NAMESPACE_SCOPE_INFRA = f ' namespace=~ " { INFRA_REGEX } " '
2026-01-01 14:44:33 -03:00
# One scope variable name per resource (CPU, GPU, RAM).
NAMESPACE_SCOPE_VARS = [ " namespace_scope_cpu " , " namespace_scope_gpu " , " namespace_scope_ram " ]
2026-01-18 02:50:07 -03:00
# --- "Glue" CronJob health expressions -------------------------------------
# CronJobs are selected through the glue label on kube_cronjob_labels; every
# other kube_cronjob_* metric is restricted to that set with an "and on(...)".
GLUE_LABEL = ' label_atlas_bstein_dev_glue= " true " '
GLUE_JOBS = f" kube_cronjob_labels {{ {GLUE_LABEL} }} "
GLUE_FILTER = f" and on(namespace,cronjob) {GLUE_JOBS} "
GLUE_LAST_SUCCESS = f" (kube_cronjob_status_last_successful_time {GLUE_FILTER} ) "
GLUE_LAST_SCHEDULE = f" (kube_cronjob_status_last_schedule_time {GLUE_FILTER} ) "
# Suspended glue jobs only (the comparison filters to series whose value is 1).
GLUE_SUSPENDED = f" (kube_cronjob_spec_suspend {GLUE_FILTER} ) == 1 "
GLUE_ACTIVE = f" (kube_cronjob_status_active {GLUE_FILTER} ) "
# Ages in seconds and in hours.
GLUE_LAST_SUCCESS_AGE = f" (time() - {GLUE_LAST_SUCCESS} ) "
GLUE_LAST_SCHEDULE_AGE = f" (time() - {GLUE_LAST_SCHEDULE} ) "
GLUE_LAST_SUCCESS_AGE_HOURS = f" ( {GLUE_LAST_SUCCESS_AGE} ) / 3600 "
GLUE_LAST_SCHEDULE_AGE_HOURS = f" ( {GLUE_LAST_SCHEDULE_AGE} ) / 3600 "
# A job is stale when its last success is older than this many seconds (36h).
GLUE_STALE_WINDOW_SEC = 36 * 3600
# 0/1 per job ("> bool"), so summing it counts the stale jobs.
GLUE_STALE = f" ( {GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC} ) "
# Glue jobs that have no last-successful-time series at all.
GLUE_MISSING = f" ( {GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time) "
# Same two sets with suspended jobs removed.
GLUE_STALE_ACTIVE = f" ( {GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED} ) "
GLUE_MISSING_ACTIVE = f" ( {GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED} ) "
# Stale + missing. Each operand gets its own "or on() vector(0)" default:
# an aggregation over an empty vector returns nothing, and adding "nothing"
# to a real count would discard the count (the previous form reported 0
# stale jobs whenever no job was missing).
GLUE_STALE_COUNT = (
    f" (sum( {GLUE_STALE_ACTIVE} ) or on() vector(0)) "
    f"+ (count( {GLUE_MISSING_ACTIVE} ) or on() vector(0)) "
)
GLUE_MISSING_COUNT = f" count( {GLUE_MISSING_ACTIVE} ) or on() vector(0) "
GLUE_SUSPENDED_COUNT = f" sum( {GLUE_SUSPENDED} ) or on() vector(0) "
# Ariadne task run counters (ariadne_task_runs_total), grouped by task or
# status over the window named in each constant.
ARIADNE_TASK_ERRORS_RANGE = ' sum by (task) (increase(ariadne_task_runs_total { status= " error " }[$__range])) '
2026-01-19 16:58:02 -03:00
ARIADNE_TASK_ERRORS_24H = ' sum by (task) (increase(ariadne_task_runs_total { status= " error " }[24h])) '
2026-01-21 13:37:36 -03:00
ARIADNE_TASK_ERRORS_1H = ' sum by (task) (increase(ariadne_task_runs_total { status= " error " }[1h])) '
ARIADNE_TASK_ERRORS_30D = ' sum by (task) (increase(ariadne_task_runs_total { status= " error " }[30d])) '
2026-01-19 16:58:02 -03:00
ARIADNE_TASK_SUCCESS_24H = ' sum by (task) (increase(ariadne_task_runs_total { status= " ok " }[24h])) '
2026-01-21 02:57:40 -03:00
ARIADNE_TASK_RUNS_BY_STATUS_1H = ' sum by (status) (increase(ariadne_task_runs_total[1h])) '
2026-01-21 11:29:29 -03:00
# Ungrouped totals.
ARIADNE_TASK_ERRORS_1H_TOTAL = ' sum(increase(ariadne_task_runs_total { status= " error " }[1h])) '
ARIADNE_TASK_ERRORS_24H_TOTAL = ' sum(increase(ariadne_task_runs_total { status= " error " }[24h])) '
ARIADNE_TASK_RUNS_1H_TOTAL = ' sum(increase(ariadne_task_runs_total[1h])) '
2026-01-21 14:30:55 -03:00
# Time-series variants windowed by the Grafana $__interval variable.
ARIADNE_TASK_ATTEMPTS_SERIES = ' sum(increase(ariadne_task_runs_total[$__interval])) '
ARIADNE_TASK_FAILURES_SERIES = ' sum(increase(ariadne_task_runs_total { status= " error " }[$__interval])) '
# "Warnings" = runs whose status is neither ok nor error; defaults to 0.
ARIADNE_TASK_WARNINGS_SERIES = (
' sum(increase(ariadne_task_runs_total { status!~ " ok|error " }[$__interval])) or on() vector(0) '
)
2026-04-12 17:29:18 -03:00
# Label matcher restricting every expression below to schedule.* tasks.
ARIADNE_SCHEDULE_TASK_FILTER = ' task=~ " ^schedule \\ ..+$ " '
# Hours since the last successful / failed run of each scheduled task.
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = (
f " (time() - ariadne_schedule_last_success_timestamp_seconds {{ { ARIADNE_SCHEDULE_TASK_FILTER } }} ) / 3600 "
)
ARIADNE_SCHEDULE_LAST_ERROR_HOURS = (
f " (time() - ariadne_schedule_last_error_timestamp_seconds {{ { ARIADNE_SCHEDULE_TASK_FILTER } }} ) / 3600 "
)
2026-01-21 14:30:55 -03:00
# Same ages, using the newest timestamp seen within the dashboard $__range.
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
2026-04-12 17:29:18 -03:00
f " (time() - max_over_time(ariadne_schedule_last_success_timestamp_seconds {{ { ARIADNE_SCHEDULE_TASK_FILTER } }} [$__range])) / 3600 "
2026-01-21 14:30:55 -03:00
)
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
2026-04-12 17:29:18 -03:00
f " (time() - max_over_time(ariadne_schedule_last_error_timestamp_seconds {{ { ARIADNE_SCHEDULE_TASK_FILTER } }} [$__range])) / 3600 "
)
2026-04-12 20:05:39 -03:00
# Hours until the next scheduled run (next-run timestamp minus now).
ARIADNE_SCHEDULE_NEXT_RUN_HOURS = (
f " ((ariadne_schedule_next_run_timestamp_seconds {{ { ARIADNE_SCHEDULE_TASK_FILTER } }} - time()) / 3600) "
)
2026-04-12 20:09:43 -03:00
# One series per scheduled task; the *_FALLBACK expressions below multiply it
# by 0 to synthesise a default row for tasks missing from the primary metric.
ARIADNE_SCHEDULE_TASK_INDEX = f " ariadne_schedule_next_run_timestamp_seconds {{ { ARIADNE_SCHEDULE_TASK_FILTER } }} "
2026-04-12 17:29:18 -03:00
ARIADNE_SCHEDULE_LAST_STATUS = f " ariadne_schedule_last_status {{ { ARIADNE_SCHEDULE_TASK_FILTER } }} "
2026-04-12 20:05:39 -03:00
# Number of tasks reporting a last-success timestamp; 0 when there are none.
ARIADNE_SCHEDULE_SIGNAL_COUNT = (
f " count(ariadne_schedule_last_success_timestamp_seconds {{ { ARIADNE_SCHEDULE_TASK_FILTER } }} ) or on() vector(0) "
)
# Staleness threshold in seconds (36 hours).
ARIADNE_SCHEDULE_STALE_WINDOW_SEC = 36 * 3600
# Number of tasks whose last success is older than the stale window.
ARIADNE_SCHEDULE_STALE_COUNT = (
f " sum(((time() - ariadne_schedule_last_success_timestamp_seconds {{ { ARIADNE_SCHEDULE_TASK_FILTER } }} ) > bool { ARIADNE_SCHEDULE_STALE_WINDOW_SEC } )) "
" or on() vector(0) "
)
# Number of tasks that have a next-run series but no last-success series.
ARIADNE_SCHEDULE_MISSING_SUCCESS_COUNT = (
f " count((ariadne_schedule_next_run_timestamp_seconds {{ { ARIADNE_SCHEDULE_TASK_FILTER } }} unless on(task) "
f " ariadne_schedule_last_success_timestamp_seconds {{ { ARIADNE_SCHEDULE_TASK_FILTER } }} )) or on() vector(0) "
)
# Number of tasks for which (1 - last_status) is greater than 0.
ARIADNE_SCHEDULE_FAILED_LAST_COUNT = (
f " sum(((1 - ariadne_schedule_last_status {{ { ARIADNE_SCHEDULE_TASK_FILTER } }} ) > bool 0)) or on() vector(0) "
)
2026-04-12 17:29:18 -03:00
# Runs and error runs per scheduled task within $__range.
ARIADNE_SCHEDULE_RUNS_RANGE = (
f ' sum by (task) (increase(ariadne_task_runs_total {{ { ARIADNE_SCHEDULE_TASK_FILTER } }} [$__range])) '
)
ARIADNE_SCHEDULE_ERRORS_RANGE = (
f ' sum by (task) (increase(ariadne_task_runs_total {{ status= " error " , { ARIADNE_SCHEDULE_TASK_FILTER } }} [$__range])) '
2026-01-21 14:30:55 -03:00
)
2026-04-12 20:09:43 -03:00
# Fallback variants: tasks absent from the primary expression get a sentinel
# (999 for ages, -1 for status, 0 for counts) derived from the task index.
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS_FALLBACK = (
f " ( { ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS } ) or on(task) (0 * { ARIADNE_SCHEDULE_TASK_INDEX } + 999) "
)
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS_FALLBACK = (
f " ( { ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS } ) or on(task) (0 * { ARIADNE_SCHEDULE_TASK_INDEX } + 999) "
)
ARIADNE_SCHEDULE_LAST_STATUS_FALLBACK = (
f " ( { ARIADNE_SCHEDULE_LAST_STATUS } ) or on(task) (0 * { ARIADNE_SCHEDULE_TASK_INDEX } - 1) "
)
ARIADNE_SCHEDULE_RUNS_RANGE_FALLBACK = (
f " ( { ARIADNE_SCHEDULE_RUNS_RANGE } ) or on(task) (0 * { ARIADNE_SCHEDULE_TASK_INDEX } ) "
)
ARIADNE_SCHEDULE_ERRORS_RANGE_FALLBACK = (
f " ( { ARIADNE_SCHEDULE_ERRORS_RANGE } ) or on(task) (0 * { ARIADNE_SCHEDULE_TASK_INDEX } ) "
)
2026-04-12 20:05:39 -03:00
# Jenkins workspace cleanup: number of last-run timestamp series; 0 when absent.
JENKINS_CLEANUP_SIGNAL_COUNT = (
" count(ariadne_jenkins_workspace_cleanup_last_run_timestamp_seconds) or on() vector(0) "
)
# Cleanup runs and cleaned objects within $__range, grouped by the listed labels.
JENKINS_CLEANUP_RUNS_RANGE = (
" sum by (mode, status) (increase(ariadne_jenkins_workspace_cleanup_runs_total[$__range])) "
)
JENKINS_CLEANUP_OBJECTS_RANGE = (
" sum by (kind, action, mode) (increase(ariadne_jenkins_workspace_cleanup_objects_total[$__range])) "
)
# Hours since the last run / last success; 999 is the sentinel when the metric is absent.
JENKINS_CLEANUP_LAST_RUN_AGE_HOURS = (
" ((time() - ariadne_jenkins_workspace_cleanup_last_run_timestamp_seconds) / 3600) or on() vector(999) "
)
JENKINS_CLEANUP_LAST_SUCCESS_AGE_HOURS = (
" ((time() - ariadne_jenkins_workspace_cleanup_last_success_timestamp_seconds) / 3600) or on() vector(999) "
)
# Raw last-deleted / last-planned gauges, defaulting to 0.
JENKINS_CLEANUP_LAST_DELETED = " ariadne_jenkins_workspace_cleanup_last_deleted_total or on() vector(0) "
JENKINS_CLEANUP_LAST_PLANNED = " ariadne_jenkins_workspace_cleanup_last_planned_total or on() vector(0) "
2026-04-13 00:25:33 -03:00
# Jenkins "build weather" per-job signals: status, ages in hours, duration in minutes.
JENKINS_BUILD_WEATHER_LAST_STATUS = " ariadne_jenkins_build_weather_job_last_status "
JENKINS_BUILD_WEATHER_LAST_RUN_AGE_HOURS = (
" (time() - ariadne_jenkins_build_weather_job_last_run_timestamp_seconds) / 3600 "
)
JENKINS_BUILD_WEATHER_LAST_SUCCESS_AGE_HOURS = (
" (time() - ariadne_jenkins_build_weather_job_last_success_timestamp_seconds) / 3600 "
)
JENKINS_BUILD_WEATHER_LAST_FAILURE_AGE_HOURS = (
" (time() - ariadne_jenkins_build_weather_job_last_failure_timestamp_seconds) / 3600 "
)
JENKINS_BUILD_WEATHER_LAST_DURATION_MINUTES = (
" ariadne_jenkins_build_weather_job_last_duration_seconds / 60 "
)
2026-04-12 20:05:39 -03:00
# Count of Released/Failed PersistentVolumes whose claim ref is a
# pvc-workspace-* claim in the jenkins namespace; 0 when none.
JENKINS_WORKSPACE_PV_STALE_COUNT = (
' sum((kube_persistentvolume_status_phase { phase=~ " Released|Failed " } > bool 0) '
' * on(persistentvolume) group_left(claim_namespace,name) '
' kube_persistentvolume_claim_ref { claim_namespace= " jenkins " ,name=~ " pvc-workspace-.* " }) or on() vector(0) '
)
# Age in hours (since creation) of those same PersistentVolumes.
JENKINS_WORKSPACE_PV_STALE_AGE_HOURS = (
' ((time() - kube_persistentvolume_created) / 3600) '
' * on(persistentvolume) group_left(claim_namespace,name) '
' kube_persistentvolume_claim_ref { claim_namespace= " jenkins " ,name=~ " pvc-workspace-.* " } '
' * on(persistentvolume) group_left() (kube_persistentvolume_status_phase { phase=~ " Released|Failed " } > bool 0) '
)
2026-01-19 16:58:02 -03:00
# Bare metric name for Ariadne access requests.
ARIADNE_ACCESS_REQUESTS = " ariadne_access_requests_total "
2026-04-10 15:35:20 -03:00
# Suites included in the platform test panels; also drives the per-suite
# targets generated in PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS.
PLATFORM_TEST_SUITE_NAMES = [
" ariadne " ,
" metis " ,
" ananke " ,
" atlasbot " ,
" lesavka " ,
" pegasus " ,
" soteria " ,
" titan-iac " ,
" bstein-home " ,
" arcanagon " ,
" data-prepper " ,
]
# Regex alternation over the suite names, used in suite=~ matchers.
PLATFORM_TEST_SUITE_MATCHER = " | " . join ( PLATFORM_TEST_SUITE_NAMES )
2026-04-04 01:33:15 -03:00
# 30-day counts of successful and of all quality-gate runs (each defaults to 0).
PLATFORM_TEST_SUCCESS_EVENTS_30D = (
2026-04-10 15:35:20 -03:00
f ' (sum(increase(platform_quality_gate_runs_total {{ suite=~ " { PLATFORM_TEST_SUITE_MATCHER } " ,status=~ " ok|passed|success " }} [30d])) or on() vector(0)) '
2026-04-04 01:33:15 -03:00
)
PLATFORM_TEST_TOTAL_EVENTS_30D = (
2026-04-10 15:35:20 -03:00
f ' (sum(increase(platform_quality_gate_runs_total {{ suite=~ " { PLATFORM_TEST_SUITE_MATCHER } " }} [30d])) or on() vector(0)) '
2026-04-04 01:33:15 -03:00
)
2026-03-31 13:54:04 -03:00
# Success percentage over 30 days; the divisor is clamped to >= 1.
TEST_SUCCESS_RATE = (
2026-04-04 01:33:15 -03:00
f " 100 * ( { PLATFORM_TEST_SUCCESS_EVENTS_30D } ) / clamp_min(( { PLATFORM_TEST_TOTAL_EVENTS_30D } ), 1) "
2026-01-21 13:37:36 -03:00
)
2026-04-04 01:33:15 -03:00
# Runs in the last 24h whose status is not ok/passed/success.
TEST_FAILURES_24H_TOTAL = (
2026-04-10 15:35:20 -03:00
f ' (sum(increase(platform_quality_gate_runs_total {{ suite=~ " { PLATFORM_TEST_SUITE_MATCHER } " ,status!~ " ok|passed|success " }} [24h])) or on() vector(0)) '
2026-04-04 01:33:15 -03:00
)
2026-04-09 19:27:48 -03:00
# Same failures broken down by suite, sorted descending.
PLATFORM_TEST_FAILURES_24H_BY_SUITE = (
2026-04-10 15:35:20 -03:00
f ' sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total {{ suite=~ " { PLATFORM_TEST_SUITE_MATCHER } " ,status!~ " ok|passed|success " }} [24h]))) '
2026-04-09 19:27:48 -03:00
)
2026-04-04 01:33:15 -03:00
# 30-day run counts by (suite, status).
PLATFORM_TEST_ACTIVITY_30D = (
2026-04-10 15:35:20 -03:00
f ' sum by (suite, status) (increase(platform_quality_gate_runs_total {{ suite=~ " { PLATFORM_TEST_SUITE_MATCHER } " }} [30d])) '
2026-01-21 13:37:36 -03:00
)
2026-04-09 19:27:48 -03:00
# Window used for each point of the per-suite success-rate targets.
PLATFORM_TEST_POINT_WINDOW = " 1h "
2026-04-10 15:35:20 -03:00
# One query target per suite: refId is "A", "B", ... by list position, the
# legend is the suite name, and the expression is the success percentage over
# PLATFORM_TEST_POINT_WINDOW, kept only where the suite had more than 0 runs
# in that window (the trailing "and on() (... > 0)").
PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS = [
2026-04-09 16:16:35 -03:00
{
2026-04-10 15:35:20 -03:00
" refId " : chr ( ord ( " A " ) + index ) ,
2026-04-09 16:16:35 -03:00
" expr " : (
2026-04-10 15:35:20 -03:00
f ' (100 * (sum(increase(platform_quality_gate_runs_total {{ suite= " { suite } " ,status=~ " ok|passed|success " }} '
2026-04-09 16:35:14 -03:00
f ' [ { PLATFORM_TEST_POINT_WINDOW } ]))) / '
2026-04-10 15:35:20 -03:00
f ' clamp_min((sum(increase(platform_quality_gate_runs_total {{ suite= " { suite } " }} [ { PLATFORM_TEST_POINT_WINDOW } ]))), 1)) '
f ' and on() ((sum(increase(platform_quality_gate_runs_total {{ suite= " { suite } " }} [ { PLATFORM_TEST_POINT_WINDOW } ]))) > 0) '
2026-04-09 16:16:35 -03:00
) ,
" legendFormat " : suite ,
}
2026-04-10 15:35:20 -03:00
for index , suite in enumerate ( PLATFORM_TEST_SUITE_NAMES )
2026-04-09 16:16:35 -03:00
]
2026-04-09 20:16:44 -03:00
# 24h success percentage per suite, only for suites with runs, sorted descending.
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
2026-04-10 15:35:20 -03:00
f ' sort_desc((100 * (sum by (suite) (increase(platform_quality_gate_runs_total {{ suite=~ " { PLATFORM_TEST_SUITE_MATCHER } " ,status=~ " ok|passed|success " }} [24h]))) '
f ' / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total {{ suite=~ " { PLATFORM_TEST_SUITE_MATCHER } " }} [24h]))), 1)) '
f ' and on(suite) ((sum by (suite) (increase(platform_quality_gate_runs_total {{ suite=~ " { PLATFORM_TEST_SUITE_MATCHER } " }} [24h]))) > 0)) '
2026-04-09 20:16:44 -03:00
)
2026-04-12 22:58:21 -03:00
# One series per suite that had quality-gate runs in the last 30 days; used
# below (multiplied by 0) to synthesise rows for suites missing a metric.
QUALITY_GATE_SUITE_INDEX_30D = (
f ' sum by (suite) (increase(platform_quality_gate_runs_total {{ suite=~ " { PLATFORM_TEST_SUITE_MATCHER } " }} [30d])) '
)
# Coverage per suite: any *_quality_gate_coverage_percent metric first, then
# the workspace line-coverage metric for suites that lack one.
QUALITY_GATE_COVERAGE_BY_SUITE = (
' (max by (suite) ( { __name__=~ " .*_quality_gate_coverage_percent " })) '
' or on(suite) (max by (suite) (platform_quality_gate_workspace_line_coverage_percent)) '
)
# Same, with -1 as the sentinel for indexed suites that report no coverage.
QUALITY_GATE_COVERAGE_BY_SUITE_WITH_MISSING = (
f " ( { QUALITY_GATE_COVERAGE_BY_SUITE } ) or on(suite) (0 * ( { QUALITY_GATE_SUITE_INDEX_30D } ) - 1) "
)
# Shortfall against a 95 target, floored at 0.
QUALITY_GATE_COVERAGE_GAP_BY_SUITE = (
f " clamp_min(95 - ( { QUALITY_GATE_COVERAGE_BY_SUITE } ), 0) "
)
# Per-suite maximum of platform_quality_gate_source_lines_over_500_total.
QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE = (
" max by (suite) (platform_quality_gate_source_lines_over_500_total) "
)
# Same, with -1 as the sentinel for indexed suites that report nothing.
QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE_WITH_MISSING = (
f " ( { QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE } ) or on(suite) (0 * ( { QUALITY_GATE_SUITE_INDEX_30D } ) - 1) "
)
# Hours since the last successful restic backup per (namespace, pvc), sorted
# descending. PVCs whose health reason is missing/no_completed/lookup_failed/
# unknown_timestamp fall back to a value built from "* 999".
# NOTE(review): the fallback multiplies by (pvc_backup_count > bool 0), so it
# yields 999 only when that count is positive and 0 otherwise - confirm intent.
PVC_BACKUP_AGE_HOURS_BY_PVC = (
' sort_desc(max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds { driver= " restic " }) / 3600) '
' or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason { driver= " restic " ,reason=~ " missing|no_completed|lookup_failed|unknown_timestamp " } > 0) '
' * (pvc_backup_count { driver= " restic " } > bool 0)) * 999)))) '
)
2026-04-08 23:33:17 -03:00
# Base label matcher for every Ananke UPS metric.
ANANKE_SELECTOR = ' job= " ananke-power " '
# Two UPS sources, each identified by a "source" label value and paired with a node name.
ANANKE_UPS_DB_NAME = " Pyrphoros "
ANANKE_UPS_DB_NODE = " titan-db "
ANANKE_UPS_TETHYS_NAME = " Statera "
ANANKE_UPS_TETHYS_NODE = " titan-24 "
# Per-UPS matchers: the base selector plus the source name.
ANANKE_UPS_DB_SELECTOR = f ' { ANANKE_SELECTOR } ,source= " { ANANKE_UPS_DB_NAME } " '
ANANKE_UPS_TETHYS_SELECTOR = f ' { ANANKE_SELECTOR } ,source= " { ANANKE_UPS_TETHYS_NAME } " '
# Fleet-wide aggregates; each defaults to 0 when no series exist.
ANANKE_UPS_ON_BATTERY = f " sum(ananke_ups_on_battery {{ { ANANKE_SELECTOR } }} ) or on() vector(0) "
ANANKE_UPS_LOW_BATTERY = f " sum(ananke_ups_low_battery {{ { ANANKE_SELECTOR } }} ) or on() vector(0) "
ANANKE_UPS_RUNTIME_MIN = f " min(ananke_ups_runtime_seconds {{ { ANANKE_SELECTOR } }} ) or on() vector(0) "
# Smallest runtime as a percentage of the largest threshold (divisor clamped to >= 1).
ANANKE_UPS_RUNTIME_HEADROOM_PERCENT = (
f " 100 * min(ananke_ups_runtime_seconds {{ { ANANKE_SELECTOR } }} ) / "
f " clamp_min(max(ananke_ups_threshold_seconds {{ { ANANKE_SELECTOR } }} ), 1) "
2026-04-03 17:49:09 -03:00
)
2026-04-08 23:33:17 -03:00
# Increase of the shutdown trigger counter over the last day.
ANANKE_UPS_TRIGGER_COUNT_1D = f " increase(ananke_shutdown_triggers_total {{ { ANANKE_SELECTOR } }} [1d]) or on() vector(0) "
# Per-UPS single-value expressions (max over matching series, default 0).
ANANKE_UPS_RUNTIME_DB = (
f ' max(ananke_ups_runtime_seconds {{ { ANANKE_UPS_DB_SELECTOR } }} ) or on() vector(0) '
2026-04-03 17:49:09 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_RUNTIME_TETHYS = (
f ' max(ananke_ups_runtime_seconds {{ { ANANKE_UPS_TETHYS_SELECTOR } }} ) or on() vector(0) '
2026-04-03 17:49:09 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_ON_BATTERY_DB = (
f ' max(ananke_ups_on_battery {{ { ANANKE_UPS_DB_SELECTOR } }} ) or on() vector(0) '
2026-04-03 17:49:09 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_ON_BATTERY_TETHYS = (
f ' max(ananke_ups_on_battery {{ { ANANKE_UPS_TETHYS_SELECTOR } }} ) or on() vector(0) '
2026-04-03 17:49:09 -03:00
)
2026-04-13 01:08:58 -03:00
# Shutdown thresholds clamped to >= 1; unlike the others these have no vector(0) default.
ANANKE_UPS_THRESHOLD_DB = (
f ' clamp_min(max(ananke_ups_threshold_seconds {{ { ANANKE_UPS_DB_SELECTOR } }} ), 1) '
)
ANANKE_UPS_THRESHOLD_TETHYS = (
f ' clamp_min(max(ananke_ups_threshold_seconds {{ { ANANKE_UPS_TETHYS_SELECTOR } }} ), 1) '
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_BATTERY_CHARGE_DB = (
f ' max(ananke_ups_battery_charge_percent {{ { ANANKE_UPS_DB_SELECTOR } }} ) or on() vector(0) '
2026-04-03 17:49:09 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_BATTERY_CHARGE_TETHYS = (
f ' max(ananke_ups_battery_charge_percent {{ { ANANKE_UPS_TETHYS_SELECTOR } }} ) or on() vector(0) '
2026-04-03 17:49:09 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_LOAD_DB = (
f ' max(ananke_ups_load_percent {{ { ANANKE_UPS_DB_SELECTOR } }} ) or on() vector(0) '
2026-04-03 17:49:09 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_LOAD_TETHYS = (
f ' max(ananke_ups_load_percent {{ { ANANKE_UPS_TETHYS_SELECTOR } }} ) or on() vector(0) '
2026-04-03 20:45:40 -03:00
)
2026-04-08 23:33:17 -03:00
# Estimated draw in watts: load percent * nominal watts / 100.
ANANKE_UPS_DRAW_WATTS_DB = (
f ' max((ananke_ups_load_percent {{ { ANANKE_UPS_DB_SELECTOR } }} '
f ' * ananke_ups_power_nominal_watts {{ { ANANKE_UPS_DB_SELECTOR } }} ) / 100) or on() vector(0) '
2026-04-03 20:45:40 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_DRAW_WATTS_TETHYS = (
f ' max((ananke_ups_load_percent {{ { ANANKE_UPS_TETHYS_SELECTOR } }} '
f ' * ananke_ups_power_nominal_watts {{ { ANANKE_UPS_TETHYS_SELECTOR } }} ) / 100) or on() vector(0) '
)
ANANKE_UPS_DRAW_WATTS_TOTAL = (
f ' sum((ananke_ups_load_percent {{ { ANANKE_SELECTOR } }} * ananke_ups_power_nominal_watts {{ { ANANKE_SELECTOR } }} ) / 100) '
2026-04-03 20:45:40 -03:00
" or on() vector(0) "
)
2026-04-08 23:33:17 -03:00
# *_SERIES variants: the same draw expressions without the vector(0) default.
ANANKE_UPS_DRAW_WATTS_DB_SERIES = (
f ' ((ananke_ups_load_percent {{ { ANANKE_UPS_DB_SELECTOR } }} '
f ' * ananke_ups_power_nominal_watts {{ { ANANKE_UPS_DB_SELECTOR } }} ) / 100) '
2026-04-03 20:45:40 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES = (
f ' ((ananke_ups_load_percent {{ { ANANKE_UPS_TETHYS_SELECTOR } }} '
f ' * ananke_ups_power_nominal_watts {{ { ANANKE_UPS_TETHYS_SELECTOR } }} ) / 100) '
2026-04-03 20:45:40 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_DRAW_WATTS_TOTAL_SERIES = (
f ' sum((ananke_ups_load_percent {{ { ANANKE_SELECTOR } }} * ananke_ups_power_nominal_watts {{ { ANANKE_SELECTOR } }} ) / 100) '
2026-04-03 20:45:40 -03:00
)
2026-04-08 23:33:17 -03:00
# Raw per-source series (no aggregation).
ANANKE_UPS_RUNTIME_BY_SOURCE = f " ananke_ups_runtime_seconds {{ { ANANKE_SELECTOR } }} "
ANANKE_UPS_LOAD_BY_SOURCE = f " ananke_ups_load_percent {{ { ANANKE_SELECTOR } }} "
ANANKE_UPS_CHARGE_BY_SOURCE = f " ananke_ups_battery_charge_percent {{ { ANANKE_SELECTOR } }} "
ANANKE_UPS_TRIGGER_BY_SOURCE = f " ananke_ups_trigger_active {{ { ANANKE_SELECTOR } }} "
2026-04-13 01:08:58 -03:00
def ups_discharge_risk_expr(on_battery_expr, runtime_expr, shutdown_threshold_expr):
    """Build a PromQL expression grading UPS discharge risk from 0 to 3.

    The generated query multiplies an "on battery" flag (``> bool 0``) by
    ``1`` plus one point for each of two runtime checks: runtime below three
    times the shutdown threshold, and runtime below twice the threshold.
    ``or on() vector(0)`` supplies 0 when the expression returns no series.

    Args:
        on_battery_expr: PromQL text whose value is positive while on battery.
        runtime_expr: PromQL text for the remaining runtime.
        shutdown_threshold_expr: PromQL text for the shutdown threshold.

    Returns:
        str: The assembled PromQL expression.
    """
    on_battery_gate = f" ((( {on_battery_expr} ) > bool 0) * ( "
    under_triple = f" 1 + (( {runtime_expr} ) < bool (3 * ( {shutdown_threshold_expr} ))) + "
    under_double = f" (( {runtime_expr} ) < bool (2 * ( {shutdown_threshold_expr} ))) "
    zero_fallback = " )) or on() vector(0) "
    return "".join((on_battery_gate, under_triple, under_double, zero_fallback))
# Discharge-risk expressions, one per UPS, produced by ups_discharge_risk_expr
# from that UPS's on-battery, runtime and threshold expressions (the inputs
# are defined outside this part of the file).
ANANKE_UPS_DISCHARGE_RISK_DB = ups_discharge_risk_expr (
ANANKE_UPS_ON_BATTERY_DB ,
ANANKE_UPS_RUNTIME_DB ,
ANANKE_UPS_THRESHOLD_DB ,
)
ANANKE_UPS_DISCHARGE_RISK_TETHYS = ups_discharge_risk_expr (
ANANKE_UPS_ON_BATTERY_TETHYS ,
ANANKE_UPS_RUNTIME_TETHYS ,
ANANKE_UPS_THRESHOLD_TETHYS ,
)
2026-04-12 17:20:05 -03:00
# Number of typhon_temperature_celsius series, or 0 when there are none.
CLIMATE_SENSOR_COUNT = " count(typhon_temperature_celsius) or on() vector(0) "
2026-04-12 17:56:54 -03:00
# Labels dropped by the `max without (...)` aggregations below.
CLIMATE_DEDUP_LABELS = " job,instance,pod,service,endpoint,namespace "
2026-04-13 01:43:21 -03:00
# Temperature series with the dedup labels aggregated away. The `!= 0`
# comparison filters out samples whose value is exactly 0.
# NOTE(review): presumably 0 marks an invalid reading - confirm with the exporter.
CLIMATE_TEMP_SERIES = (
2026-04-13 02:26:09 -03:00
f " max without ( { CLIMATE_DEDUP_LABELS } ) (typhon_temperature_celsius != 0) "
2026-04-13 01:43:21 -03:00
)
2026-04-12 17:56:54 -03:00
# Celsius series converted with * 9 / 5 + 32.
CLIMATE_TEMP_FAHRENHEIT_SERIES = f " ( { CLIMATE_TEMP_SERIES } ) * 9 / 5 + 32 "
2026-04-13 01:43:21 -03:00
# NOTE(review): named "pressure" but reads typhon_vpd_kpa - presumably a
# vapour-pressure-deficit metric; confirm the intended naming.
CLIMATE_PRESSURE_SERIES = (
2026-04-13 02:26:09 -03:00
f " max without ( { CLIMATE_DEDUP_LABELS } ) (typhon_vpd_kpa != 0) "
2026-04-13 01:43:21 -03:00
)
2026-04-13 02:26:09 -03:00
# Relative humidity series, same dedup and non-zero filter as above.
CLIMATE_HUMIDITY_SERIES = (
2026-04-12 23:02:03 -03:00
f " max without ( { CLIMATE_DEDUP_LABELS } ) (typhon_relative_humidity_percent != 0) "
2026-04-12 17:28:15 -03:00
)
2026-04-13 02:26:09 -03:00
# Single-value summaries: max() across the series, falling back to 0.
CLIMATE_TEMP_MAX = f " max( { CLIMATE_TEMP_SERIES } ) or on() vector(0) "
CLIMATE_TEMP_FAHRENHEIT_MAX = f " max( { CLIMATE_TEMP_FAHRENHEIT_SERIES } ) or on() vector(0) "
CLIMATE_PRESSURE_CURRENT = f " max( { CLIMATE_PRESSURE_SERIES } ) or on() vector(0) "
CLIMATE_HUMIDITY_MAX = f " max( { CLIMATE_HUMIDITY_SERIES } ) or on() vector(0) "
# Bound series: min/max over the dashboard range ($__range) widened by a fixed
# margin (0.08 for temperature, 0.35 for humidity, 0.03 for pressure).
CLIMATE_TEMP_MIN_BOUND_SERIES = f " (min_over_time( { CLIMATE_TEMP_SERIES } [$__range]) - 0.08) "
CLIMATE_TEMP_MAX_BOUND_SERIES = f " (max_over_time( { CLIMATE_TEMP_SERIES } [$__range]) + 0.08) "
2026-04-13 00:17:29 -03:00
# Humidity bounds are clamped to the 0..100 range.
CLIMATE_HUMIDITY_MIN_BOUND_SERIES = (
2026-04-13 02:26:09 -03:00
f " clamp_min((min_over_time( { CLIMATE_HUMIDITY_SERIES } [$__range]) - 0.35), 0) "
2026-04-13 00:17:29 -03:00
)
CLIMATE_HUMIDITY_MAX_BOUND_SERIES = (
2026-04-13 02:26:09 -03:00
f " clamp_max((max_over_time( { CLIMATE_HUMIDITY_SERIES } [$__range]) + 0.35), 100) "
2026-04-13 00:17:29 -03:00
)
# The lower pressure bound is clamped at 0; the upper bound is not clamped.
CLIMATE_PRESSURE_MIN_BOUND_SERIES = (
2026-04-13 02:26:09 -03:00
f " clamp_min((min_over_time( { CLIMATE_PRESSURE_SERIES } [$__range]) - 0.03), 0) "
2026-04-13 00:17:29 -03:00
)
CLIMATE_PRESSURE_MAX_BOUND_SERIES = (
2026-04-13 02:26:09 -03:00
f " (max_over_time( { CLIMATE_PRESSURE_SERIES } [$__range]) + 0.03) "
2026-04-13 00:17:29 -03:00
)
2026-04-03 20:45:40 -03:00
# *_CURRENT: highest typhon_fan_speed_level for one fan_group after the dedup
# labels are aggregated away, falling back to 0 when no series match.
CLIMATE_FAN_OUTLET_CURRENT = (
2026-04-12 17:56:54 -03:00
f ' max(max without ( { CLIMATE_DEDUP_LABELS } ) (typhon_fan_speed_level {{ fan_group= " outlet " }} )) or on() vector(0) '
2026-04-03 20:45:40 -03:00
)
CLIMATE_FAN_INSIDE_INLET_CURRENT = (
2026-04-12 17:56:54 -03:00
f ' max(max without ( { CLIMATE_DEDUP_LABELS } ) (typhon_fan_speed_level {{ fan_group= " inside_inlet " }} )) or on() vector(0) '
2026-04-03 20:45:40 -03:00
)
CLIMATE_FAN_OUTSIDE_INLET_CURRENT = (
2026-04-12 17:56:54 -03:00
f ' max(max without ( { CLIMATE_DEDUP_LABELS } ) (typhon_fan_speed_level {{ fan_group= " outside_inlet " }} )) or on() vector(0) '
2026-04-03 20:45:40 -03:00
)
# The interior expression uses a regex matcher that also accepts the
# "unknown" fan_group.
CLIMATE_FAN_INTERIOR_CURRENT = (
2026-04-12 17:56:54 -03:00
f ' max(max without ( { CLIMATE_DEDUP_LABELS } ) (typhon_fan_speed_level {{ fan_group=~ " interior|unknown " }} )) or on() vector(0) '
2026-04-03 20:45:40 -03:00
)
# *_SERIES: the same per-group selection without the outer max() and without
# the zero fallback.
CLIMATE_FAN_OUTLET_SERIES = (
2026-04-12 17:56:54 -03:00
f ' max without ( { CLIMATE_DEDUP_LABELS } ) (typhon_fan_speed_level {{ fan_group= " outlet " }} ) '
2026-04-03 20:45:40 -03:00
)
CLIMATE_FAN_INSIDE_INLET_SERIES = (
2026-04-12 17:56:54 -03:00
f ' max without ( { CLIMATE_DEDUP_LABELS } ) (typhon_fan_speed_level {{ fan_group= " inside_inlet " }} ) '
2026-04-03 20:45:40 -03:00
)
CLIMATE_FAN_OUTSIDE_INLET_SERIES = (
2026-04-12 17:56:54 -03:00
f ' max without ( { CLIMATE_DEDUP_LABELS } ) (typhon_fan_speed_level {{ fan_group= " outside_inlet " }} ) '
2026-04-03 20:45:40 -03:00
)
CLIMATE_FAN_INTERIOR_SERIES = (
2026-04-12 17:56:54 -03:00
f ' max without ( { CLIMATE_DEDUP_LABELS } ) (typhon_fan_speed_level {{ fan_group=~ " interior|unknown " }} ) '
2026-04-03 20:45:40 -03:00
)
2026-04-12 19:46:39 -03:00
def with_metric_label(expr, metric):
    """Wrap ``expr`` in a PromQL ``label_replace`` call that writes a ``metric`` label.

    Args:
        expr: PromQL expression text to wrap.
        metric: Text inserted as the replacement value for the label.

    Returns:
        str: The ``label_replace(...)`` expression.
    """
    template = ' label_replace(( {expr} ), " metric " , " {metric} " , " __name__ " , " .* " ) '
    return template.format(expr=expr, metric=metric)
def with_ups_metric_labels(expr, ups, metric):
    """Wrap ``expr`` in two nested ``label_replace`` calls.

    The inner call writes an ``ups`` label and the outer call writes a
    ``metric`` label.

    Args:
        expr: PromQL expression text to wrap.
        ups: Text inserted as the replacement value for the ``ups`` label.
        metric: Text inserted as the replacement value for the ``metric`` label.

    Returns:
        str: The nested ``label_replace(...)`` expression.
    """
    outer_open = ' label_replace( '
    inner_call = f' label_replace(( {expr} ), " ups " , " {ups} " , " __name__ " , " .* " ), '
    outer_args = f' " metric " , " {metric} " , " __name__ " , " .* " '
    outer_close = ' ) '
    return outer_open + inner_call + outer_args + outer_close
# Each *_ROW_EXPR joins several labelled expressions with the PromQL `or`
# operator so one query returns one series per label value.
# Climate readings: each member is tagged with a distinct `metric` label.
CLIMATE_CURRENT_ROW_EXPR = " or " . join (
[
with_metric_label ( CLIMATE_TEMP_MAX , " Temp °C " ) ,
with_metric_label ( CLIMATE_TEMP_FAHRENHEIT_MAX , " Temp °F " ) ,
with_metric_label ( CLIMATE_HUMIDITY_MAX , " Humidity " ) ,
with_metric_label ( CLIMATE_PRESSURE_CURRENT , " Pressure " ) ,
]
)
# Fan levels: each current value is wrapped in round() before labelling.
CLIMATE_FAN_CURRENT_ROW_EXPR = " or " . join (
[
with_metric_label ( f " round( { CLIMATE_FAN_OUTLET_CURRENT } ) " , " Outlet " ) ,
with_metric_label ( f " round( { CLIMATE_FAN_INSIDE_INLET_CURRENT } ) " , " Inlet In " ) ,
with_metric_label ( f " round( { CLIMATE_FAN_OUTSIDE_INLET_CURRENT } ) " , " Inlet Out " ) ,
with_metric_label ( f " round( { CLIMATE_FAN_INTERIOR_CURRENT } ) " , " Interior " ) ,
]
)
# UPS values: draw and runtime per UPS, tagged with both `ups` and `metric`.
UPS_CURRENT_ROW_EXPR = " or " . join (
[
2026-04-12 22:53:23 -03:00
with_ups_metric_labels ( ANANKE_UPS_DRAW_WATTS_DB , ANANKE_UPS_DB_NAME , " Draw " ) ,
with_ups_metric_labels ( ANANKE_UPS_RUNTIME_DB , ANANKE_UPS_DB_NAME , " Runtime " ) ,
with_ups_metric_labels ( ANANKE_UPS_DRAW_WATTS_TETHYS , ANANKE_UPS_TETHYS_NAME , " Draw " ) ,
with_ups_metric_labels ( ANANKE_UPS_RUNTIME_TETHYS , ANANKE_UPS_TETHYS_NAME , " Runtime " ) ,
2026-04-12 22:07:58 -03:00
]
)
2026-01-22 18:23:17 -03:00
# Two series in one query: total pg_stat_activity_count tagged conn="used" and
# the maximum pg_settings_max_connections tagged conn="max".
POSTGRES_CONN_USED = (
' label_replace(sum(pg_stat_activity_count), " conn " , " used " , " __name__ " , " .* " ) '
' or label_replace(max(pg_settings_max_connections), " conn " , " max " , " __name__ " , " .* " ) '
2026-01-22 15:23:23 -03:00
)
# The single database (datname) with the highest summed activity count.
POSTGRES_CONN_HOTTEST = ' topk(1, sum by (datname) (pg_stat_activity_count)) '
2026-01-21 13:37:36 -03:00
# Jobs owned by a CronJob, with job_name copied into an owner_name label so
# they can be matched against kube_pod_owner below.
ONEOFF_JOB_OWNER = (
' label_replace(kube_job_owner { owner_kind= " CronJob " }, " owner_name " , " $1 " , " job_name " , " (.*) " ) '
)
# Pods owned by a Job, excluding those whose (namespace, owner_name) matches a
# CronJob-owned Job.
ONEOFF_JOB_PODS = f ' (kube_pod_owner {{ owner_kind= " Job " }} unless on(namespace, owner_name) { ONEOFF_JOB_OWNER } ) '
# Age in hours (time since kube_pod_start_time, divided by 3600) of those pods,
# joined with owner_name and with a phase of Running or Succeeded.
ONEOFF_JOB_POD_AGE_HOURS = (
' ((time() - kube_pod_start_time { pod!= " " }) / 3600) '
f ' * on(namespace,pod) group_left(owner_name) { ONEOFF_JOB_PODS } '
' * on(namespace,pod) group_left(phase) '
' max by (namespace,pod,phase) (kube_pod_status_phase { phase=~ " Running|Succeeded " }) '
)
2026-01-21 14:30:55 -03:00
# Hours between now and the largest value GLUE_LAST_SUCCESS / GLUE_LAST_SCHEDULE
# took within the dashboard range ($__range).
# NOTE(review): assumes those expressions hold Unix timestamps in seconds - confirm.
GLUE_LAST_SUCCESS_RANGE_HOURS = f " (time() - max_over_time( { GLUE_LAST_SUCCESS } [$__range])) / 3600 "
GLUE_LAST_SCHEDULE_RANGE_HOURS = f " (time() - max_over_time( { GLUE_LAST_SCHEDULE } [$__range])) / 3600 "
2025-11-18 10:47:24 -03:00
# Node names treated as GPU nodes, plus the same names joined with "|".
GPU_NODES = [ " titan-20 " , " titan-21 " , " titan-22 " , " titan-24 " ]
GPU_NODE_REGEX = " | " . join ( GPU_NODES )
2025-11-17 18:55:11 -03:00
# Per-router request rate over 5 minutes.
TRAEFIK_ROUTER_EXPR = " sum by (router) (rate(traefik_router_requests_total[5m])) "
2025-11-18 14:08:33 -03:00
# Bytes/s received and transmitted by traefik-* pods in the traefik namespace,
# falling back to 0 when no series match.
TRAEFIK_NET_INGRESS = (
' sum(rate(container_network_receive_bytes_total { namespace= " traefik " ,pod=~ " traefik-.* " }[5m])) '
2025-11-18 11:30:33 -03:00
" or on() vector(0) "
)
2025-11-18 14:08:33 -03:00
TRAEFIK_NET_EGRESS = (
' sum(rate(container_network_transmit_bytes_total { namespace= " traefik " ,pod=~ " traefik-.* " }[5m])) '
" or on() vector(0) "
)
2025-11-18 15:55:24 -03:00
# Cluster-wide container network receive/transmit rates (5m), with a zero fallback.
NET_CLUSTER_RX = (
' sum(rate(container_network_receive_bytes_total { namespace!= " " ,pod!= " " ,container!= " " }[5m])) '
" or on() vector(0) "
)
NET_CLUSTER_TX = (
2025-11-18 11:30:33 -03:00
' sum(rate(container_network_transmit_bytes_total { namespace!= " " ,pod!= " " ,container!= " " }[5m])) '
" or on() vector(0) "
)
2025-11-18 16:18:52 -03:00
# Device matcher excluding loopback and virtual/overlay interface name patterns.
PHYSICAL_NET_FILTER = ' device!~ " lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.* " '
# Node-level receive/transmit rates restricted by PHYSICAL_NET_FILTER.
NET_NODE_RX_PHYS = (
f ' sum(rate(node_network_receive_bytes_total {{ { PHYSICAL_NET_FILTER } }} [5m])) or on() vector(0) '
)
NET_NODE_TX_PHYS = (
f ' sum(rate(node_network_transmit_bytes_total {{ { PHYSICAL_NET_FILTER } }} [5m])) or on() vector(0) '
)
# NOTE(review): NET_TOTAL_EXPR aliases the transmit-only expression, the same
# value as NET_EGRESS_EXPR - confirm "total" is not meant to be RX + TX.
NET_TOTAL_EXPR = NET_NODE_TX_PHYS
NET_INGRESS_EXPR = NET_NODE_RX_PHYS
NET_EGRESS_EXPR = NET_NODE_TX_PHYS
# Receive + transmit rate for pods outside the traefik namespace.
NET_INTERNAL_EXPR = (
2025-11-18 17:09:13 -03:00
' sum(rate(container_network_receive_bytes_total { namespace!= " traefik " ,pod!= " " }[5m]) '
' + rate(container_network_transmit_bytes_total { namespace!= " traefik " ,pod!= " " }[5m])) '
2025-11-18 16:18:52 -03:00
' or on() vector(0) '
)
2025-12-12 18:00:43 -03:00
# API server 5xx request rate, and p99 latencies converted from seconds to
# milliseconds (* 1000).
APISERVER_5XX_RATE = ' sum(rate(apiserver_request_total { code=~ " 5.. " }[5m])) '
APISERVER_P99_LATENCY_MS = (
" histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000 "
)
ETCD_P99_LATENCY_MS = (
" histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000 "
)
# Traefik entrypoint request rates: all requests, and those whose code is not 5xx.
TRAEFIK_TOTAL_5M = " sum(rate(traefik_entrypoint_requests_total[5m])) "
TRAEFIK_SUCCESS_5M = ' sum(rate(traefik_entrypoint_requests_total { code!~ " 5.. " }[5m])) '
# Success ratio. The denominator is clamped to at least 1.
# NOTE(review): with a total rate below 1 req/s the clamp makes the ratio
# smaller than the true success fraction - confirm this is intended.
TRAEFIK_SLI_5M = f " ( { TRAEFIK_SUCCESS_5M } ) / clamp_min( { TRAEFIK_TOTAL_5M } , 1) "
TRAEFIK_P99_LATENCY_MS = (
" histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000 "
)
TRAEFIK_P95_LATENCY_MS = (
" histogram_quantile(0.95, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000 "
)
# Availability target; traefik_burn divides by (1 - SLO_AVAILABILITY).
SLO_AVAILABILITY = 0.999
def traefik_sli(window):
    """Return a PromQL success-ratio expression for Traefik entrypoints.

    The ratio is the rate of requests whose ``code`` does not match ``5..``
    divided by the rate of all requests, both over ``window``. The
    denominator is clamped to a minimum of 1.

    Args:
        window: Range-vector duration text placed inside ``[...]``.

    Returns:
        str: The PromQL expression.
    """
    rate_prefix = " sum(rate(traefik_entrypoint_requests_total"
    all_requests = f"{rate_prefix}[ {window} ])) "
    non_5xx_requests = f'{rate_prefix} {{ code!~ " 5.. " }} [ {window} ])) '
    return f" ( {non_5xx_requests} ) / clamp_min( {all_requests} , 1) "
def traefik_burn(window):
    """Return a PromQL burn-rate expression for the Traefik availability SLO.

    The expression is ``(1 - SLI) / (1 - SLO_AVAILABILITY)``, where the SLI
    text comes from ``traefik_sli(window)`` and the divisor is computed in
    Python and embedded as a number.

    Args:
        window: Range-vector duration text forwarded to ``traefik_sli``.

    Returns:
        str: The PromQL expression.
    """
    success_ratio = traefik_sli(window)
    error_budget = 1 - SLO_AVAILABILITY
    return f" (1 - ( {success_ratio} )) / {error_budget} "
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# Panel factories
# ---------------------------------------------------------------------------
2025-11-17 14:22:46 -03:00
2025-11-17 16:27:38 -03:00
def stat_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit=" none ",
    decimals=None,
    thresholds=None,
    text_mode=" value ",
    legend=None,
    instant=False,
    value_suffix=None,
    links=None,
    targets=None,
    field_overrides=None,
    description=None,
    orientation=None,
    wide_layout=None,
    graph_mode=" area ",
    justify_mode=" center ",
    title_size=None,
    value_size=None,
):
    """Return a Grafana stat panel definition.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title.
        expr: Query for the default single target (refId A); unused when
            ``targets`` is supplied.
        grid: Value stored as the panel's grid position.
        unit: Field unit.
        decimals: Decimal places; left out of the panel when None.
        thresholds: Threshold config; a grey base step with green from 1 is
            used when this is falsy.
        text_mode: Stat text mode option.
        legend: Legend format, applied only when there is exactly one target.
        instant: When true, each target without an instant flag gets one.
        value_suffix: Stored in the field's custom config when truthy.
        links: Panel links, added when truthy.
        targets: Explicit target dicts. Each one is shallow-copied, so the
            legend/instant adjustments never modify caller-owned dicts.
        field_overrides: Field overrides; an empty list when falsy.
        description: Panel description, added when truthy.
        orientation: Stat orientation option, added when truthy.
        wide_layout: Stat wide-layout option, added when not None.
        graph_mode: Stat graph mode option.
        justify_mode: Stat justify mode option.
        title_size: Text title size, added when not None.
        value_size: Text value size, added when not None.

    Returns:
        dict: The panel definition.
    """
    defaults = {
        " color ": {" mode ": " thresholds "},
        " mappings ": [],
        " thresholds ": thresholds
        or {
            " mode ": " absolute ",
            " steps ": [
                {" color ": " rgba(115, 115, 115, 1) ", " value ": None},
                {" color ": " green ", " value ": 1},
            ],
        },
        " unit ": unit,
        " custom ": {" displayMode ": " auto "},
    }
    if value_suffix:
        defaults[" custom "][" valueSuffix "] = value_suffix
    if decimals is not None:
        defaults[" decimals "] = decimals

    if targets is None:
        target_list = [{" expr ": expr, " refId ": " A "}]
    else:
        # Work on copies: a target list shared between panels must not pick up
        # this panel's legendFormat or instant flag.
        target_list = [dict(target) for target in targets]

    panel = {
        " id ": panel_id,
        " type ": " stat ",
        " title ": title,
        " datasource ": PROM_DS,
        " gridPos ": grid,
        " targets ": target_list,
        " fieldConfig ": {" defaults ": defaults, " overrides ": field_overrides or []},
        " options ": {
            " colorMode ": " value ",
            " graphMode ": graph_mode,
            " justifyMode ": justify_mode,
            " reduceOptions ": {" calcs ": [" lastNotNull "], " fields ": " ", " values ": False},
            " textMode ": text_mode,
        },
    }
    options = panel[" options "]
    if orientation:
        options[" orientation "] = orientation
    if wide_layout is not None:
        options[" wideLayout "] = wide_layout
    if title_size is not None or value_size is not None:
        text_sizes = {}
        if title_size is not None:
            text_sizes[" titleSize "] = title_size
        if value_size is not None:
            text_sizes[" valueSize "] = value_size
        options[" text "] = text_sizes
    # A single legend string only makes sense for a single target.
    if legend and len(target_list) == 1:
        target_list[0][" legendFormat "] = legend
    if instant:
        for target in target_list:
            # setdefault keeps an explicit per-target instant setting intact.
            target.setdefault(" instant ", True)
    if links:
        panel[" links "] = links
    if description:
        panel[" description "] = description
    return panel
2025-11-18 12:11:47 -03:00
def gauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    min_value=0,
    max_value=1,
    thresholds=None,
    links=None,
):
    """Return a Grafana gauge panel definition with a single target.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title.
        expr: Query for the single target (refId A).
        grid: Value stored as the panel's grid position.
        min_value: Field minimum.
        max_value: Field maximum; also the red step of the default thresholds.
        thresholds: Threshold config; green turning red at ``max_value`` is
            used when this is falsy.
        links: Panel links, added when truthy.

    Returns:
        dict: The panel definition.
    """
    threshold_config = thresholds
    if not threshold_config:
        threshold_config = {
            " mode ": " absolute ",
            " steps ": [
                {" color ": " green ", " value ": None},
                {" color ": " red ", " value ": max_value},
            ],
        }
    field_config = {
        " defaults ": {
            " min ": min_value,
            " max ": max_value,
            " thresholds ": threshold_config,
        },
        " overrides ": [],
    }
    gauge_options = {
        " reduceOptions ": {" calcs ": [" lastNotNull "], " fields ": " ", " values ": False},
        " orientation ": " auto ",
        " showThresholdMarkers ": False,
        " showThresholdLabels ": False,
    }
    panel = {
        " id ": panel_id,
        " type ": " gauge ",
        " title ": title,
        " datasource ": PROM_DS,
        " gridPos ": grid,
        " targets ": [{" expr ": expr, " refId ": " A "}],
        " fieldConfig ": field_config,
        " options ": gauge_options,
    }
    if links:
        panel[" links "] = links
    return panel
2025-11-17 16:27:38 -03:00
def timeseries_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit=" none ",
    max_value=None,
    legend=None,
    legend_display=" table ",
    legend_placement=" bottom ",
    legend_calcs=None,
    time_from=None,
    links=None,
    targets=None,
    field_overrides=None,
    description=None,
):
    """Return a Grafana time-series panel definition.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title.
        expr: Query for the default single target (refId A); unused when
            ``targets`` is supplied.
        grid: Value stored as the panel's grid position.
        unit: Field unit.
        max_value: Field maximum, added when not None.
        legend: Legend format, applied only when there is exactly one target.
        legend_display: Legend display mode.
        legend_placement: Legend placement.
        legend_calcs: Legend calculations, added when truthy.
        time_from: Panel time override, added when truthy.
        links: Panel links, added when truthy.
        targets: Explicit target dicts. Each one is shallow-copied, so setting
            the legend never modifies caller-owned dicts.
        field_overrides: Field overrides; an empty list when falsy.
        description: Panel description, added when truthy.

    Returns:
        dict: The panel definition.
    """
    if targets is None:
        target_list = [{" expr ": expr, " refId ": " A "}]
    else:
        # Copy so a target list reused for another panel keeps its own legend.
        target_list = [dict(target) for target in targets]

    panel = {
        " id ": panel_id,
        " type ": " timeseries ",
        " title ": title,
        " datasource ": PROM_DS,
        " gridPos ": grid,
        " targets ": target_list,
        " fieldConfig ": {" defaults ": {" unit ": unit}, " overrides ": field_overrides or []},
        " options ": {
            " legend ": {
                " displayMode ": legend_display,
                " placement ": legend_placement,
            },
            " tooltip ": {" mode ": " multi "},
        },
    }
    if max_value is not None:
        panel[" fieldConfig "][" defaults "][" max "] = max_value
    # A single legend string only makes sense for a single target.
    if legend and len(target_list) == 1:
        target_list[0][" legendFormat "] = legend
    if legend_calcs:
        panel[" options "][" legend "][" calcs "] = legend_calcs
    if time_from:
        panel[" timeFrom "] = time_from
    if links:
        panel[" links "] = links
    if description:
        panel[" description "] = description
    return panel
2026-04-13 00:17:29 -03:00
def canvas_metric_grid_panel(
    panel_id,
    title,
    grid,
    *,
    targets,
    field_overrides=None,
    links=None,
    description=None,
    metric_size=26,
    label_size=12,
    color_fields=None,
    thresholds=None,
):
    """Return a canvas panel showing four labelled values in a fixed 2x2 grid.

    The first four entries of ``targets`` are used; each must carry a legend
    format, which serves as both the label text and the field shown in the
    cell.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title; also used in the root frame's name.
        grid: Value stored as the panel's grid position.
        targets: Query targets (at least four).
        field_overrides: Field overrides; an empty list when falsy.
        links: Panel links, added when truthy.
        description: Panel description, added when truthy.
        metric_size: Text size of the value elements.
        label_size: Text size of the label elements.
        color_fields: Field name driving each cell's colour, in cell order;
            defaults to the four legend formats.
        thresholds: Threshold config; a grey base step with green from 1 is
            used when this is falsy.

    Returns:
        dict: The panel definition.
    """
    legend_key = " legendFormat "
    if color_fields is None:
        color_fields = [targets[slot][legend_key] for slot in range(4)]

    def placed_element(kind, name, left, top, height, config):
        # Shared scaffold for label and value elements: fixed-width,
        # top-left anchored, transparent background and border.
        return {
            " type ": kind,
            " name ": name,
            " constraint ": {" horizontal ": " left ", " vertical ": " top "},
            " placement ": {
                " left ": left,
                " top ": top,
                " width ": 146,
                " height ": height,
            },
            " background ": {" color ": {" fixed ": " transparent "}},
            " border ": {" color ": {" fixed ": " transparent "}},
            " config ": config,
            " links ": [],
        }

    def label_config(text):
        return {
            " align ": " center ",
            " valign ": " middle ",
            " size ": label_size,
            " color ": {" fixed ": " text "},
            " text ": {
                " fixed ": text,
            },
        }

    def value_config(field, color_field):
        return {
            " align ": " center ",
            " valign ": " middle ",
            " size ": metric_size,
            " color ": {" field ": color_field, " fixed ": " text "},
            " text ": {
                " mode ": " field ",
                " field ": field,
                " fixed ": " ",
            },
        }

    # (left, label top, value top) for cells 1..4, row by row.
    cell_layout = ((12, 18, 32), (168, 18, 32), (12, 76, 90), (168, 76, 90))
    elements = []
    for slot, (left, label_top, value_top) in enumerate(cell_layout):
        number = slot + 1
        legend = targets[slot][legend_key]
        elements.append(
            placed_element(" text ", f" Cell {number} label ", left, label_top, 14, label_config(legend))
        )
        elements.append(
            placed_element(
                " metric-value ",
                f" Cell {number} ",
                left,
                value_top,
                42,
                value_config(legend, color_fields[slot]),
            )
        )

    threshold_config = thresholds or {
        " mode ": " absolute ",
        " steps ": [
            {" color ": " rgba(115, 115, 115, 1) ", " value ": None},
            {" color ": " green ", " value ": 1},
        ],
    }
    panel = {
        " id ": panel_id,
        " type ": " canvas ",
        " title ": title,
        " datasource ": PROM_DS,
        " gridPos ": grid,
        " targets ": targets,
        " fieldConfig ": {
            " defaults ": {
                " mappings ": [],
                " thresholds ": threshold_config,
                " color ": {" mode ": " thresholds "},
            },
            " overrides ": field_overrides or [],
        },
        " options ": {
            " inlineEditing ": False,
            " showAdvancedTypes ": True,
            " panZoom ": False,
            " infinitePan ": False,
            " root ": {
                " type ": " frame ",
                " name ": f" {title} frame ",
                " elements ": elements,
                " background ": {" color ": {" fixed ": " transparent "}},
                " border ": {" color ": {" fixed ": " transparent "}},
            },
        },
    }
    if links:
        panel[" links "] = links
    if description:
        panel[" description "] = description
    return panel
2026-04-13 04:51:00 -03:00
def canvas_two_metric_row_panel(
    panel_id,
    title,
    grid,
    *,
    targets,
    field_overrides=None,
    links=None,
    description=None,
    metric_size=30,
    metric_top=10,
    metric_height=42,
    color_fields=None,
    thresholds=None,
):
    """Return a canvas panel showing two unlabelled values side by side.

    The first two entries of ``targets`` are used; each must carry a legend
    format, which names the field shown in its cell.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title; also used in the root frame's name.
        grid: Value stored as the panel's grid position.
        targets: Query targets (at least two).
        field_overrides: Field overrides; an empty list when falsy.
        links: Panel links, added when truthy.
        description: Panel description, added when truthy.
        metric_size: Text size of the value elements.
        metric_top: Top offset shared by both value elements.
        metric_height: Height of both value elements.
        color_fields: Field name driving each cell's colour, in cell order;
            defaults to the two legend formats.
        thresholds: Threshold config; a grey base step with green from 1 is
            used when this is falsy.

    Returns:
        dict: The panel definition.
    """
    legend_key = " legendFormat "
    if color_fields is None:
        color_fields = [targets[slot][legend_key] for slot in range(2)]

    def value_element(name, field, left, color_field):
        # One metric-value element anchored top-left at the shared row offset.
        return {
            " type ": " metric-value ",
            " name ": name,
            " constraint ": {" horizontal ": " left ", " vertical ": " top "},
            " placement ": {
                " left ": left,
                " top ": metric_top,
                " width ": 146,
                " height ": metric_height,
            },
            " background ": {" color ": {" fixed ": " transparent "}},
            " border ": {" color ": {" fixed ": " transparent "}},
            " config ": {
                " align ": " center ",
                " valign ": " middle ",
                " size ": metric_size,
                " color ": {" field ": color_field, " fixed ": " text "},
                " text ": {
                    " mode ": " field ",
                    " field ": field,
                    " fixed ": " ",
                },
            },
            " links ": [],
        }

    elements = []
    for slot, left in enumerate((12, 168)):
        field = targets[slot][legend_key]
        elements.append(value_element(f" Cell {slot + 1} ", field, left, color_fields[slot]))

    threshold_config = thresholds or {
        " mode ": " absolute ",
        " steps ": [
            {" color ": " rgba(115, 115, 115, 1) ", " value ": None},
            {" color ": " green ", " value ": 1},
        ],
    }
    panel = {
        " id ": panel_id,
        " type ": " canvas ",
        " title ": title,
        " datasource ": PROM_DS,
        " gridPos ": grid,
        " targets ": targets,
        " fieldConfig ": {
            " defaults ": {
                " mappings ": [],
                " thresholds ": threshold_config,
                " color ": {" mode ": " thresholds "},
            },
            " overrides ": field_overrides or [],
        },
        " options ": {
            " inlineEditing ": False,
            " showAdvancedTypes ": True,
            " panZoom ": False,
            " infinitePan ": False,
            " root ": {
                " type ": " frame ",
                " name ": f" {title} frame ",
                " elements ": elements,
                " background ": {" color ": {" fixed ": " transparent "}},
                " border ": {" color ": {" fixed ": " transparent "}},
            },
        },
    }
    if links:
        panel[" links "] = links
    if description:
        panel[" description "] = description
    return panel
2025-11-17 16:27:38 -03:00
def table_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit=" none ",
    transformations=None,
    instant=False,
    options=None,
    filterable=True,
    footer=None,
    format=None,
    targets=None,
    field_overrides=None,
    links=None,
    description=None,
):
    """Return a Grafana table panel definition.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title.
        expr: Query for the default single target (refId A); unused when
            ``targets`` is supplied. It is passed through unchanged.
        grid: Value stored as the panel's grid position.
        unit: Field unit.
        transformations: Panel transformations, added when truthy.
        instant: When true, each target without an instant flag gets one.
        options: Extra table options merged over the defaults (header shown,
            column filters off).
        filterable: Stored in the field's custom config.
        footer: Footer option, added when not None.
        format: When truthy, each target without a format gets this one.
        targets: Explicit target dicts. Each one is shallow-copied, so the
            instant/format defaults never modify caller-owned dicts.
        field_overrides: Field overrides; an empty list when falsy.
        links: Panel links, added when truthy.
        description: Panel description, added when truthy.

    Returns:
        dict: The panel definition.
    """
    panel_options = {" showHeader ": True, " columnFilters ": False}
    if options:
        panel_options.update(options)
    if footer is not None:
        panel_options[" footer "] = footer
    field_defaults = {" unit ": unit, " custom ": {" filterable ": filterable}}

    if targets is None:
        target_list = [{" expr ": expr, " refId ": " A "}]
    else:
        # Copy so a target list reused for another panel does not inherit this
        # table's instant/format settings.
        target_list = [dict(target) for target in targets]
    for target in target_list:
        # setdefault keeps explicit per-target settings intact.
        if instant:
            target.setdefault(" instant ", True)
        if format:
            target.setdefault(" format ", format)

    panel = {
        " id ": panel_id,
        " type ": " table ",
        " title ": title,
        " datasource ": PROM_DS,
        " gridPos ": grid,
        " targets ": target_list,
        " fieldConfig ": {" defaults ": field_defaults, " overrides ": field_overrides or []},
        " options ": panel_options,
    }
    if transformations:
        panel[" transformations "] = transformations
    if links:
        panel[" links "] = links
    if description:
        panel[" description "] = description
    return panel
2026-01-01 14:44:33 -03:00
def pie_panel(panel_id, title, expr, grid, *, links=None, description=None):
    """Return a pie chart panel whose slices are labelled by namespace.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title.
        expr: Query for the single target (refId A).
        grid: Value stored as the panel's grid position.
        links: Panel links, added when truthy.
        description: Panel description, added when truthy.

    Returns:
        dict: The panel definition.
    """
    target = {" expr ": expr, " refId ": " A ", " legendFormat ": " {{ namespace}} "}
    field_config = {
        " defaults ": {
            " unit ": " percent ",
            " color ": {" mode ": " palette-classic "},
        },
        " overrides ": [],
    }
    chart_options = {
        " legend ": {" displayMode ": " list ", " placement ": " right "},
        " pieType ": " pie ",
        " displayLabels ": [],
        " tooltip ": {" mode ": " single "},
        " colorScheme ": " interpolateSpectral ",
        " colorBy ": " value ",
        " reduceOptions ": {" calcs ": [" lastNotNull "], " fields ": " ", " values ": False},
    }
    panel = {
        " id ": panel_id,
        " type ": " piechart ",
        " title ": title,
        " datasource ": PROM_DS,
        " gridPos ": grid,
        " targets ": [target],
        " fieldConfig ": field_config,
        " options ": chart_options,
    }
    for key, value in ((" links ", links), (" description ", description)):
        if value:
            panel[key] = value
    return panel
2025-11-17 14:22:46 -03:00
2026-01-01 14:44:33 -03:00
def namespace_scope_variable(var_name, label):
    """Return a hidden custom dashboard variable selecting a namespace scope.

    The variable offers three choices (workload, all, infrastructure) backed
    by the NAMESPACE_SCOPE_* constants; the workload choice is preselected.

    Args:
        var_name: Variable name.
        label: Variable label.

    Returns:
        dict: The variable definition.
    """
    choices = (
        (" workload namespaces only ", NAMESPACE_SCOPE_WORKLOAD),
        (" all namespaces ", NAMESPACE_SCOPE_ALL),
        (" infrastructure namespaces only ", NAMESPACE_SCOPE_INFRA),
    )
    # Only the first choice starts out selected.
    options = [
        {" text ": text, " value ": value, " selected ": position == 0}
        for position, (text, value) in enumerate(choices)
    ]
    query = "".join(
        [
            " workload namespaces only : ",
            NAMESPACE_SCOPE_WORKLOAD,
            " ,all namespaces : ",
            NAMESPACE_SCOPE_ALL,
            " ,infrastructure namespaces only : ",
            NAMESPACE_SCOPE_INFRA,
        ]
    )
    preselected = options[0]
    return {
        " name ": var_name,
        " label ": label,
        " type ": " custom ",
        " query ": query,
        " current ": {
            " text ": preselected[" text "],
            " value ": preselected[" value "],
            " selected ": True,
        },
        " options ": options,
        " hide ": 2,
        " multi ": False,
        " includeAll ": False,
        " refresh ": 1,
        " sort ": 0,
        " skipUrlSync ": False,
    }
2026-01-01 14:44:33 -03:00
def namespace_scope_links(var_name):
    """Return three panel links that switch one namespace-scope variable.

    Each link's URL sets ``var_name`` to one scope value (URL-quoted) and
    passes every other variable in NAMESPACE_SCOPE_VARS through as a
    ``${name}`` reference so its current value is kept.

    Args:
        var_name: Name of the scope variable the links should change.

    Returns:
        list[dict]: Link definitions for workload, all and infrastructure scopes.
    """

    def query_string(value):
        quoted = urllib.parse.quote(value, safe=" ")
        pairs = [
            f" var- {name} = {quoted} " if name == var_name else f" var- {name} =$ {{ {name} }} "
            for name in NAMESPACE_SCOPE_VARS
        ]
        return " ? " + " & ".join(pairs)

    scopes = (
        (" Workload namespaces only ", NAMESPACE_SCOPE_WORKLOAD),
        (" All namespaces ", NAMESPACE_SCOPE_ALL),
        (" Infrastructure namespaces only ", NAMESPACE_SCOPE_INFRA),
    )
    return [
        {" title ": link_title, " url ": query_string(scope_value), " targetBlank ": False}
        for link_title, scope_value in scopes
    ]
2025-12-12 20:20:13 -03:00
def bargauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit=" none ",
    legend=None,
    links=None,
    limit=None,
    sort_order=" desc ",
    thresholds=None,
    decimals=None,
    instant=False,
    overrides=None,
):
    """Return a horizontal bar gauge panel sorted by value.

    Unless ``expr`` already begins with one of the recognised sort prefixes,
    it is wrapped in a sort call chosen by ``sort_order``. The panel also
    carries a sortBy transformation using ``sort_order`` and, when ``limit``
    is truthy, a limit transformation.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title.
        expr: Query for the single target (refId A).
        grid: Value stored as the panel's grid position.
        unit: Field unit; the percent unit also sets the field maximum to 100.
        legend: Legend format; a node-label template is used when falsy.
        links: Panel links, added when truthy.
        limit: Maximum number of rows, applied when truthy.
        sort_order: Sort direction used for both the query wrapper and the
            sortBy transformation.
        thresholds: Threshold config; green/yellow/orange/red at 50/70/85 is
            used when this is falsy.
        decimals: Decimal places, added when not None.
        instant: When true, the target is flagged as an instant query.
        overrides: Field overrides appended to the panel when truthy.

    Returns:
        dict: The panel definition.
    """
    query = expr
    already_sorted = expr.strip().startswith((" sort( ", " sort_desc( "))
    if not already_sorted:
        if sort_order == " desc ":
            query = f" sort_desc( {expr} ) "
        elif sort_order == " asc ":
            query = f" sort( {expr} ) "

    target = {
        " expr ": query,
        " refId ": " A ",
        " legendFormat ": legend or " {{ node}} ",
    }
    if instant:
        target[" instant "] = True

    field_defaults = {
        " unit ": unit,
        " min ": 0,
        " max ": 100 if unit == " percent " else None,
        " thresholds ": thresholds
        or {
            " mode ": " absolute ",
            " steps ": [
                {" color ": " green ", " value ": None},
                {" color ": " yellow ", " value ": 50},
                {" color ": " orange ", " value ": 70},
                {" color ": " red ", " value ": 85},
            ],
        },
    }
    field_overrides = []
    panel = {
        " id ": panel_id,
        " type ": " bargauge ",
        " title ": title,
        " datasource ": PROM_DS,
        " gridPos ": grid,
        " targets ": [target],
        " fieldConfig ": {
            " defaults ": field_defaults,
            " overrides ": field_overrides,
        },
        " options ": {
            " displayMode ": " gradient ",
            " orientation ": " horizontal ",
            " reduceOptions ": {
                " calcs ": [" lastNotNull "],
                " fields ": " ",
                " values ": False,
            },
        },
    }
    if overrides:
        field_overrides.extend(overrides)
    if decimals is not None:
        field_defaults[" decimals "] = decimals
    if links:
        panel[" links "] = links
    # Keep the bars ordered by value in the requested direction.
    steps = [
        {
            " id ": " sortBy ",
            " options ": {" fields ": [" Value "], " order ": sort_order},
        }
    ]
    panel[" transformations "] = steps
    if limit:
        steps.append({" id ": " limit ", " options ": {" limit ": limit}})
    return panel
2026-04-13 00:25:33 -03:00
def _jenkins_weather_status_expr(base_expr, comparator):
    """Restrict ``base_expr`` to jobs whose last build status passes a comparison.

    The result keeps the series of ``base_expr`` that have a match, on
    ``exported_job``, ``job_url`` and ``weather_icon``, in
    JENKINS_BUILD_WEATHER_LAST_STATUS filtered by ``comparator``.

    Args:
        base_expr: PromQL expression text to filter.
        comparator: Comparison text appended to the status expression.

    Returns:
        str: The combined PromQL expression.
    """
    kept_series = f" ( {base_expr} ) and on(exported_job,job_url,weather_icon) "
    status_filter = f" ( {JENKINS_BUILD_WEATHER_LAST_STATUS} {comparator} ) "
    return kept_series + status_filter
def jenkins_weather_bargauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit=" h ",
    decimals=2,
    sort_order=" asc ",
    limit=12,
    thresholds=None,
    links=None,
    description=None,
):
    """Return a bar gauge of Jenkins jobs coloured by last build status.

    ``expr`` is queried four times (refIds A-D), each time filtered to one
    last-status comparison; a per-refId override then gives every group a
    fixed colour. Each bar links to its job through the ``job_url`` label.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title.
        expr: Base PromQL expression shared by all four targets.
        grid: Value stored as the panel's grid position.
        unit: Field unit.
        decimals: Decimal places, added when not None.
        sort_order: Order used by the sortBy transformation.
        limit: Maximum number of rows, applied when truthy.
        thresholds: Threshold config; green/yellow/orange/red at 6/24/72 is
            used when this is falsy.
        links: Panel links, added when truthy.
        description: Panel description, added when truthy.

    Returns:
        dict: The panel definition.
    """
    # (refId, last-status comparison, fixed bar colour) for each status group.
    status_groups = (
        (" A ", " == 1 ", " green "),
        (" B ", " == 0 ", " red "),
        (" C ", " == 2 ", " yellow "),
        (" D ", " < 0 ", " gray "),
    )
    targets = [
        {
            " refId ": ref_id,
            " expr ": _jenkins_weather_status_expr(expr, comparator),
            " legendFormat ": " {{ weather_icon}} {{ exported_job}} ",
            " instant ": True,
        }
        for ref_id, comparator, _color in status_groups
    ]
    color_overrides = [
        {
            " matcher ": {" id ": " byFrameRefID ", " options ": ref_id},
            " properties ": [{" id ": " color ", " value ": {" mode ": " fixed ", " fixedColor ": color}}],
        }
        for ref_id, _comparator, color in status_groups
    ]
    field_defaults = {
        " unit ": unit,
        " min ": 0,
        " thresholds ": thresholds
        or {
            " mode ": " absolute ",
            " steps ": [
                {" color ": " green ", " value ": None},
                {" color ": " yellow ", " value ": 6},
                {" color ": " orange ", " value ": 24},
                {" color ": " red ", " value ": 72},
            ],
        },
        " links ": [
            {
                " title ": " Open Jenkins job ",
                " url ": " $ {__field.labels.job_url} ",
                " targetBlank ": True,
            }
        ],
    }
    transformations = [
        {" id ": " sortBy ", " options ": {" fields ": [" Value "], " order ": sort_order}},
    ]
    panel = {
        " id ": panel_id,
        " type ": " bargauge ",
        " title ": title,
        " datasource ": PROM_DS,
        " gridPos ": grid,
        " targets ": targets,
        " fieldConfig ": {
            " defaults ": field_defaults,
            " overrides ": color_overrides,
        },
        " options ": {
            " displayMode ": " basic ",
            " orientation ": " horizontal ",
            " reduceOptions ": {" calcs ": [" lastNotNull "], " fields ": " ", " values ": False},
        },
        " transformations ": transformations,
    }
    if decimals is not None:
        field_defaults[" decimals "] = decimals
    if limit:
        transformations.append({" id ": " limit ", " options ": {" limit ": limit}})
    if links:
        panel[" links "] = links
    if description:
        panel[" description "] = description
    return panel
2025-11-17 14:22:46 -03:00
def text_panel(panel_id, title, content, grid):
    """Build a Grafana ``text`` panel that renders ``content`` as markdown.

    Args:
        panel_id: Value stored under the panel's ``id`` key.
        title: Panel title.
        content: Markdown source stored under ``options.content``.
        grid: Value stored under ``gridPos``; passed through unchanged
            (the same object, not a copy).

    Returns:
        A new dict describing the panel. ``datasource`` is always ``None``
        and ``options.mode`` is always ``"markdown"``.
    """
    return {
        "id": panel_id,
        "type": "text",
        "title": title,
        "gridPos": grid,
        "datasource": None,
        "options": {"mode": "markdown", "content": content},
    }
2025-11-17 16:27:38 -03:00
def link_to(uid):
    """Return a single-entry panel ``links`` list targeting dashboard ``uid``.

    Args:
        uid: Dashboard identifier; interpolated verbatim into both the link
            title and the ``/d/<uid>`` URL path (no escaping is applied).

    Returns:
        A new list holding one link dict with ``title``, ``url`` and
        ``targetBlank`` set to ``True``.
    """
    return [
        {
            "title": f"Open {uid} dashboard",
            "url": f"/d/{uid}",
            "targetBlank": True,
        }
    ]
2025-11-17 14:22:46 -03:00
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# Dashboard builders
# ---------------------------------------------------------------------------
2025-11-17 14:22:46 -03:00
def build_overview ( ) :
panels = [ ]
2025-11-17 16:27:38 -03:00
2025-12-12 15:23:51 -03:00
count_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " red " , " value " : 3 } ,
] ,
}
2026-01-21 13:37:36 -03:00
age_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 6 } ,
{ " color " : " orange " , " value " : 24 } ,
{ " color " : " red " , " value " : 48 } ,
] ,
}
2025-11-18 15:55:24 -03:00
2025-12-12 15:23:51 -03:00
row1_stats = [
{
" id " : 2 ,
" title " : " Control Plane Ready " ,
" expr " : f ' sum(kube_node_status_condition {{ condition= " Ready " ,status= " true " ,node=~ " { CONTROL_REGEX } " }} ) ' ,
" kind " : " gauge " ,
" max_value " : CONTROL_TOTAL ,
" thresholds " : {
2025-11-17 19:24:03 -03:00
" mode " : " absolute " ,
" steps " : [
2025-11-18 11:12:03 -03:00
{ " color " : " red " , " value " : None } ,
2025-11-17 19:24:03 -03:00
{ " color " : " green " , " value " : CONTROL_TOTAL } ,
] ,
2025-12-12 15:23:51 -03:00
} ,
} ,
{
" id " : 3 ,
" title " : " Control Plane Workloads " ,
" expr " : CONTROL_WORKLOADS_EXPR ,
" kind " : " stat " ,
2025-12-12 20:30:00 -03:00
" thresholds " : {
" mode " : " absolute " ,
" steps " : [
2025-12-12 20:50:41 -03:00
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " red " , " value " : 3 } ,
2025-12-12 20:30:00 -03:00
] ,
} ,
2025-12-12 15:23:51 -03:00
" links " : link_to ( " atlas-pods " ) ,
} ,
2025-12-12 15:56:33 -03:00
{
" id " : 5 ,
" title " : " Stuck Terminating " ,
" expr " : STUCK_TERMINATING_EXPR ,
" kind " : " stat " ,
2025-12-12 20:30:00 -03:00
" thresholds " : {
" mode " : " absolute " ,
" steps " : [
2025-12-12 20:50:41 -03:00
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " red " , " value " : 3 } ,
2025-12-12 20:30:00 -03:00
] ,
} ,
2025-12-12 15:56:33 -03:00
" links " : link_to ( " atlas-pods " ) ,
} ,
2025-12-12 15:23:51 -03:00
{
" id " : 27 ,
2025-12-19 13:46:34 -03:00
" title " : " Atlas Availability " ,
2025-12-12 16:11:28 -03:00
" expr " : UPTIME_PERCENT_EXPR ,
2025-12-12 15:23:51 -03:00
" kind " : " stat " ,
2025-12-12 16:11:28 -03:00
" thresholds " : UPTIME_PERCENT_THRESHOLDS ,
2025-12-12 16:15:37 -03:00
" unit " : " percentunit " ,
2025-12-19 15:18:14 -03:00
" decimals " : 4 ,
2025-12-12 15:23:51 -03:00
" text_mode " : " value " ,
} ,
{
" id " : 4 ,
" title " : " Problem Pods " ,
" expr " : PROBLEM_PODS_EXPR ,
" kind " : " stat " ,
2025-12-12 20:30:00 -03:00
" thresholds " : {
" mode " : " absolute " ,
" steps " : [
2025-12-12 20:50:41 -03:00
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " red " , " value " : 3 } ,
2025-12-12 20:30:00 -03:00
] ,
} ,
2025-12-12 15:23:51 -03:00
" links " : link_to ( " atlas-pods " ) ,
} ,
{
" id " : 6 ,
" title " : " CrashLoop / ImagePull " ,
" expr " : CRASHLOOP_EXPR ,
" kind " : " stat " ,
2025-12-12 20:30:00 -03:00
" thresholds " : {
" mode " : " absolute " ,
" steps " : [
2025-12-12 20:50:41 -03:00
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " red " , " value " : 3 } ,
2025-12-12 20:30:00 -03:00
] ,
} ,
2025-12-12 15:23:51 -03:00
" links " : link_to ( " atlas-pods " ) ,
} ,
{
2025-12-12 15:56:33 -03:00
" id " : 1 ,
" title " : " Workers Ready " ,
" expr " : f ' sum(kube_node_status_condition {{ condition= " Ready " ,status= " true " ,node=~ " { WORKER_REGEX } " }} ) ' ,
" kind " : " gauge " ,
" max_value " : WORKER_TOTAL ,
" thresholds " : {
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
{ " color " : " orange " , " value " : WORKER_TOTAL - 2 } ,
{ " color " : " yellow " , " value " : WORKER_TOTAL - 1 } ,
{ " color " : " green " , " value " : WORKER_TOTAL } ,
] ,
} ,
2025-12-12 15:23:51 -03:00
} ,
]
def gauge_grid ( idx ) :
width = GAUGE_WIDTHS [ idx ] if idx < len ( GAUGE_WIDTHS ) else 4
x = sum ( GAUGE_WIDTHS [ : idx ] )
return width , x
for idx , item in enumerate ( row1_stats ) :
panel_id = item [ " id " ]
2025-11-18 15:55:24 -03:00
width , x = gauge_grid ( idx )
2025-12-12 15:23:51 -03:00
grid = { " h " : 5 , " w " : width , " x " : x , " y " : 0 }
kind = item . get ( " kind " , " gauge " )
if kind == " stat " :
2025-11-18 17:09:13 -03:00
panels . append (
stat_panel (
panel_id ,
2025-12-12 15:23:51 -03:00
item [ " title " ] ,
item [ " expr " ] ,
grid ,
thresholds = item . get ( " thresholds " ) ,
2025-12-12 16:15:37 -03:00
legend = None ,
links = item . get ( " links " ) ,
text_mode = item . get ( " text_mode " , " value " ) ,
value_suffix = item . get ( " value_suffix " ) ,
unit = item . get ( " unit " , " none " ) ,
decimals = item . get ( " decimals " ) ,
)
)
2025-11-18 17:09:13 -03:00
else :
panels . append (
gauge_panel (
panel_id ,
2025-12-12 15:23:51 -03:00
item [ " title " ] ,
item [ " expr " ] ,
grid ,
min_value = 0 ,
max_value = item . get ( " max_value " , 5 ) ,
thresholds = item . get ( " thresholds " ) ,
links = item . get ( " links " ) ,
2025-11-18 17:09:13 -03:00
)
2025-11-17 14:22:46 -03:00
)
2025-11-17 16:27:38 -03:00
2026-04-09 14:56:43 -03:00
top_health_panels = [
2025-11-17 21:20:19 -03:00
( 7 , " Hottest node: CPU " , topk_with_node ( node_cpu_expr ( ) ) , " percent " ) ,
( 8 , " Hottest node: RAM " , topk_with_node ( node_mem_expr ( ) ) , " percent " ) ,
2025-11-17 20:19:20 -03:00
( 9 , " Hottest node: NET (rx+tx) " , topk_with_node ( node_net_expr ( ) ) , " Bps " ) ,
( 10 , " Hottest node: I/O (r+w) " , topk_with_node ( node_io_expr ( ) ) , " Bps " ) ,
2026-04-09 14:56:43 -03:00
( 23 , " Astreae Usage " , astreae_usage_expr ( " /mnt/astreae " ) , " percent " ) ,
( 24 , " Asteria Usage " , astreae_usage_expr ( " /mnt/asteria " ) , " percent " ) ,
( 25 , " Astreae Free " , astreae_free_expr ( " /mnt/astreae " ) , " decbytes " ) ,
( 26 , " Asteria Free " , astreae_free_expr ( " /mnt/asteria " ) , " decbytes " ) ,
2025-11-17 16:27:38 -03:00
]
2026-04-09 14:56:43 -03:00
for idx , ( panel_id , title , expr , unit ) in enumerate ( top_health_panels ) :
is_hottest_panel = panel_id in { 7 , 8 , 9 , 10 }
2025-11-17 16:27:38 -03:00
panels . append (
stat_panel (
panel_id ,
title ,
2025-11-17 20:19:20 -03:00
f " { expr } " ,
2026-04-09 14:56:43 -03:00
{ " h " : 2 , " w " : 3 , " x " : 3 * idx , " y " : 5 } ,
2025-11-17 16:27:38 -03:00
unit = unit ,
thresholds = PERCENT_THRESHOLDS if unit == " percent " else None ,
2026-04-09 14:56:43 -03:00
text_mode = " name_and_value " if is_hottest_panel else " value " ,
legend = " {{ node}} " if is_hottest_panel else None ,
instant = is_hottest_panel ,
links = link_to ( " atlas-storage " if panel_id in { 23 , 24 , 25 , 26 } else " atlas-nodes " ) ,
2025-11-17 16:27:38 -03:00
)
)
2026-01-05 21:55:59 -03:00
mail_bounce_rate_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 5 } ,
{ " color " : " orange " , " value " : 8 } ,
{ " color " : " red " , " value " : 10 } ,
] ,
}
2026-01-06 02:06:20 -03:00
mail_limit_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 70 } ,
{ " color " : " orange " , " value " : 85 } ,
{ " color " : " red " , " value " : 95 } ,
] ,
}
2026-01-06 02:34:52 -03:00
mail_success_thresholds = {
2026-01-05 21:55:59 -03:00
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
2026-01-06 02:34:52 -03:00
{ " color " : " orange " , " value " : 90 } ,
{ " color " : " yellow " , " value " : 95 } ,
{ " color " : " green " , " value " : 98 } ,
2026-01-05 21:55:59 -03:00
] ,
}
2026-04-03 14:55:16 -03:00
panels . append (
2026-04-13 04:51:00 -03:00
canvas_two_metric_row_panel (
2026-04-03 14:55:16 -03:00
40 ,
2026-04-13 03:35:39 -03:00
f " { ANANKE_UPS_DB_NAME } UPS Current " ,
{ " h " : 2 , " w " : 6 , " x " : 0 , " y " : 7 } ,
2026-04-13 05:16:37 -03:00
metric_size = 24 ,
metric_top = 8 ,
metric_height = 34 ,
2026-04-13 00:17:29 -03:00
targets = [
2026-04-13 03:35:39 -03:00
{ " refId " : " A " , " expr " : ANANKE_UPS_DRAW_WATTS_DB , " legendFormat " : " Draw " , " instant " : True } ,
{ " refId " : " B " , " expr " : ANANKE_UPS_RUNTIME_DB , " legendFormat " : " Runtime " , " instant " : True } ,
2026-04-13 00:17:29 -03:00
] ,
2026-04-03 22:16:02 -03:00
field_overrides = [
2026-04-13 03:35:39 -03:00
{ " matcher " : { " id " : " byName " , " options " : " Draw " } , " properties " : [ { " id " : " unit " , " value " : " watt " } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " Runtime " } , " properties " : [ { " id " : " unit " , " value " : " s " } ] } ,
2026-04-03 22:16:02 -03:00
] ,
2026-04-13 03:35:39 -03:00
links = link_to ( " atlas-power " ) ,
)
)
panels . append (
2026-04-13 04:51:00 -03:00
canvas_two_metric_row_panel (
2026-04-13 03:35:39 -03:00
144 ,
f " { ANANKE_UPS_TETHYS_NAME } UPS Current " ,
2026-04-13 03:52:36 -03:00
{ " h " : 3 , " w " : 6 , " x " : 0 , " y " : 9 } ,
2026-04-13 04:51:00 -03:00
metric_size = 30 ,
2026-04-13 03:35:39 -03:00
targets = [
{ " refId " : " A " , " expr " : ANANKE_UPS_DRAW_WATTS_TETHYS , " legendFormat " : " Draw " , " instant " : True } ,
{ " refId " : " B " , " expr " : ANANKE_UPS_RUNTIME_TETHYS , " legendFormat " : " Runtime " , " instant " : True } ,
] ,
field_overrides = [
{ " matcher " : { " id " : " byName " , " options " : " Draw " } , " properties " : [ { " id " : " unit " , " value " : " watt " } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " Runtime " } , " properties " : [ { " id " : " unit " , " value " : " s " } ] } ,
2026-04-13 01:08:58 -03:00
] ,
2026-04-03 14:55:16 -03:00
links = link_to ( " atlas-power " ) ,
)
)
2026-04-12 22:25:34 -03:00
panels . append (
2026-04-03 22:16:02 -03:00
timeseries_panel (
2026-04-03 14:55:16 -03:00
41 ,
2026-04-03 22:16:02 -03:00
" UPS History (Power Draw) " ,
None ,
2026-04-12 23:02:03 -03:00
{ " h " : 5 , " w " : 6 , " x " : 6 , " y " : 7 } ,
2026-04-03 22:16:02 -03:00
unit = " watt " ,
targets = [
2026-04-08 23:33:17 -03:00
{ " refId " : " A " , " expr " : ANANKE_UPS_DRAW_WATTS_DB_SERIES , " legendFormat " : ANANKE_UPS_DB_NAME } ,
{ " refId " : " B " , " expr " : ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES , " legendFormat " : ANANKE_UPS_TETHYS_NAME } ,
{ " refId " : " C " , " expr " : ANANKE_UPS_DRAW_WATTS_TOTAL_SERIES , " legendFormat " : " combined " } ,
2026-04-03 22:16:02 -03:00
] ,
2026-04-12 18:14:54 -03:00
legend_display = " table " ,
legend_placement = " right " ,
2026-04-03 14:55:16 -03:00
links = link_to ( " atlas-power " ) ,
)
)
panels . append (
2026-04-13 04:51:00 -03:00
canvas_two_metric_row_panel (
2026-04-03 14:55:16 -03:00
42 ,
2026-04-13 03:35:39 -03:00
" Current Enclosure Temperature " ,
{ " h " : 3 , " w " : 6 , " x " : 0 , " y " : 12 } ,
2026-04-13 04:51:00 -03:00
metric_size = 30 ,
2026-04-13 00:17:29 -03:00
targets = [
2026-04-13 03:52:36 -03:00
{ " refId " : " A " , " expr " : CLIMATE_TEMP_MAX , " legendFormat " : " °C " , " instant " : True } ,
{ " refId " : " B " , " expr " : CLIMATE_TEMP_FAHRENHEIT_MAX , " legendFormat " : " °F " , " instant " : True } ,
2026-04-13 00:17:29 -03:00
] ,
2026-04-03 22:16:02 -03:00
field_overrides = [
2026-04-13 03:52:36 -03:00
{ " matcher " : { " id " : " byName " , " options " : " °C " } , " properties " : [ { " id " : " unit " , " value " : " celsius " } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " °F " } , " properties " : [ { " id " : " unit " , " value " : " fahrenheit " } ] } ,
2026-04-03 22:16:02 -03:00
] ,
links = link_to ( " atlas-power " ) ,
)
)
panels . append (
2026-04-13 04:51:00 -03:00
canvas_two_metric_row_panel (
2026-04-13 03:35:39 -03:00
143 ,
" Current Enclosure Climate " ,
{ " h " : 3 , " w " : 6 , " x " : 0 , " y " : 15 } ,
2026-04-13 04:51:00 -03:00
metric_size = 30 ,
2026-04-13 03:35:39 -03:00
targets = [
2026-04-13 03:52:36 -03:00
{ " refId " : " A " , " expr " : CLIMATE_HUMIDITY_MAX , " legendFormat " : " % RH " , " instant " : True } ,
{ " refId " : " B " , " expr " : CLIMATE_PRESSURE_CURRENT , " legendFormat " : " kPa " , " instant " : True } ,
2026-04-13 03:35:39 -03:00
] ,
field_overrides = [
2026-04-13 03:52:36 -03:00
{ " matcher " : { " id " : " byName " , " options " : " % RH " } , " properties " : [ { " id " : " unit " , " value " : " suffix: % RH " } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " kPa " } , " properties " : [ { " id " : " unit " , " value " : " suffix:kPa " } ] } ,
2026-04-13 03:35:39 -03:00
] ,
links = link_to ( " atlas-power " ) ,
)
)
climate_history_panel = timeseries_panel (
2026-04-03 22:16:02 -03:00
43 ,
2026-04-13 01:08:58 -03:00
" Enclosure Climate History " ,
2026-04-03 22:16:02 -03:00
None ,
2026-04-12 23:02:03 -03:00
{ " h " : 6 , " w " : 6 , " x " : 6 , " y " : 12 } ,
2026-04-12 22:53:23 -03:00
unit = " none " ,
2026-04-03 22:16:02 -03:00
targets = [
2026-04-12 18:35:15 -03:00
{ " refId " : " A " , " expr " : CLIMATE_TEMP_SERIES , " legendFormat " : " C " } ,
2026-04-12 18:50:25 -03:00
{ " refId " : " B " , " expr " : CLIMATE_HUMIDITY_SERIES , " legendFormat " : " RH " } ,
{ " refId " : " C " , " expr " : CLIMATE_PRESSURE_SERIES , " legendFormat " : " P " } ,
2026-04-13 00:17:29 -03:00
{ " refId " : " D " , " expr " : CLIMATE_TEMP_MIN_BOUND_SERIES , " legendFormat " : " C bound min " } ,
{ " refId " : " E " , " expr " : CLIMATE_TEMP_MAX_BOUND_SERIES , " legendFormat " : " C bound max " } ,
{ " refId " : " F " , " expr " : CLIMATE_HUMIDITY_MIN_BOUND_SERIES , " legendFormat " : " RH bound min " } ,
{ " refId " : " G " , " expr " : CLIMATE_HUMIDITY_MAX_BOUND_SERIES , " legendFormat " : " RH bound max " } ,
{ " refId " : " H " , " expr " : CLIMATE_PRESSURE_MIN_BOUND_SERIES , " legendFormat " : " P bound min " } ,
{ " refId " : " I " , " expr " : CLIMATE_PRESSURE_MAX_BOUND_SERIES , " legendFormat " : " P bound max " } ,
2026-04-03 22:16:02 -03:00
] ,
field_overrides = [
2026-04-12 22:53:23 -03:00
{
" matcher " : { " id " : " byName " , " options " : " C " } ,
" properties " : [
{ " id " : " unit " , " value " : " suffix:°C " } ,
{ " id " : " decimals " , " value " : 2 } ,
{ " id " : " custom.axisPlacement " , " value " : " left " } ,
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
] ,
} ,
2026-04-13 00:17:29 -03:00
{
" matcher " : { " id " : " byRegexp " , " options " : " C bound .* " } ,
" properties " : [
{ " id " : " unit " , " value " : " suffix:°C " } ,
{ " id " : " custom.axisPlacement " , " value " : " left " } ,
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
{ " id " : " custom.hideFrom " , " value " : { " legend " : True , " tooltip " : True , " viz " : False } } ,
{ " id " : " custom.lineWidth " , " value " : 0 } ,
{ " id " : " custom.fillOpacity " , " value " : 0 } ,
{ " id " : " custom.showPoints " , " value " : " never " } ,
{ " id " : " color " , " value " : { " mode " : " fixed " , " fixedColor " : " transparent " } } ,
] ,
} ,
2026-04-03 22:16:02 -03:00
{
2026-04-12 18:35:15 -03:00
" matcher " : { " id " : " byName " , " options " : " RH " } ,
2026-04-12 17:28:15 -03:00
" properties " : [
2026-04-12 22:53:23 -03:00
{ " id " : " unit " , " value " : " suffix: % " } ,
{ " id " : " decimals " , " value " : 2 } ,
2026-04-12 17:56:54 -03:00
{ " id " : " custom.axisPlacement " , " value " : " right " } ,
2026-04-12 22:53:23 -03:00
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
2026-04-12 17:28:15 -03:00
] ,
} ,
2026-04-13 00:17:29 -03:00
{
" matcher " : { " id " : " byRegexp " , " options " : " RH bound .* " } ,
" properties " : [
{ " id " : " unit " , " value " : " suffix: % " } ,
{ " id " : " custom.axisPlacement " , " value " : " right " } ,
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
{ " id " : " custom.hideFrom " , " value " : { " legend " : True , " tooltip " : True , " viz " : False } } ,
{ " id " : " custom.lineWidth " , " value " : 0 } ,
{ " id " : " custom.fillOpacity " , " value " : 0 } ,
{ " id " : " custom.showPoints " , " value " : " never " } ,
{ " id " : " color " , " value " : { " mode " : " fixed " , " fixedColor " : " transparent " } } ,
] ,
} ,
2026-04-12 17:28:15 -03:00
{
2026-04-12 18:50:25 -03:00
" matcher " : { " id " : " byName " , " options " : " P " } ,
2026-04-03 22:16:02 -03:00
" properties " : [
2026-04-12 17:56:54 -03:00
{ " id " : " unit " , " value " : " suffix:kPa " } ,
2026-04-03 22:16:02 -03:00
{ " id " : " custom.axisPlacement " , " value " : " right " } ,
{ " id " : " decimals " , " value " : 2 } ,
2026-04-12 22:53:23 -03:00
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
2026-04-03 22:16:02 -03:00
] ,
2026-04-13 00:17:29 -03:00
} ,
{
" matcher " : { " id " : " byRegexp " , " options " : " P bound .* " } ,
" properties " : [
{ " id " : " unit " , " value " : " suffix:kPa " } ,
{ " id " : " custom.axisPlacement " , " value " : " right " } ,
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
{ " id " : " custom.hideFrom " , " value " : { " legend " : True , " tooltip " : True , " viz " : False } } ,
{ " id " : " custom.lineWidth " , " value " : 0 } ,
{ " id " : " custom.fillOpacity " , " value " : 0 } ,
{ " id " : " custom.showPoints " , " value " : " never " } ,
{ " id " : " color " , " value " : { " mode " : " fixed " , " fixedColor " : " transparent " } } ,
] ,
} ,
2026-04-03 22:16:02 -03:00
] ,
2026-04-12 18:35:15 -03:00
legend_display = " list " ,
legend_placement = " bottom " ,
2026-04-03 14:55:16 -03:00
links = link_to ( " atlas-power " ) ,
2026-04-13 02:26:09 -03:00
description = " Temperature on left axis, humidity and pressure on right axis with dynamic bound series so small swings remain visible. " ,
2026-04-03 22:16:02 -03:00
)
2026-04-13 03:35:39 -03:00
climate_history_panel [ " fieldConfig " ] [ " defaults " ] [ " custom " ] = {
" drawStyle " : " line " ,
" lineInterpolation " : " linear " ,
" lineWidth " : 2 ,
" fillOpacity " : 10 ,
" showPoints " : " never " ,
" spanNulls " : True ,
}
panels . append ( climate_history_panel )
2026-04-03 22:16:02 -03:00
panels . append (
2026-04-12 19:56:12 -03:00
stat_panel (
2026-04-03 22:16:02 -03:00
140 ,
" Fan Activity " ,
2026-04-12 19:46:39 -03:00
CLIMATE_FAN_CURRENT_ROW_EXPR ,
2026-04-12 18:14:54 -03:00
{ " h " : 6 , " w " : 6 , " x " : 12 , " y " : 12 } ,
2026-04-12 19:56:12 -03:00
unit = " none " ,
decimals = 0 ,
text_mode = " name_and_value " ,
legend = " {{ metric}} " ,
2026-04-12 19:46:39 -03:00
instant = True ,
2026-04-12 19:56:12 -03:00
thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 7 } ,
{ " color " : " red " , " value " : 9 } ,
] ,
} ,
2026-04-12 19:46:39 -03:00
field_overrides = [
{ " matcher " : { " id " : " byName " , " options " : " Outlet " } , " properties " : [ { " id " : " decimals " , " value " : 0 } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " Inlet In " } , " properties " : [ { " id " : " decimals " , " value " : 0 } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " Inlet Out " } , " properties " : [ { " id " : " decimals " , " value " : 0 } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " Interior " } , " properties " : [ { " id " : " decimals " , " value " : 0 } ] } ,
2026-04-03 22:16:02 -03:00
] ,
links = link_to ( " atlas-power " ) ,
2026-04-12 22:14:59 -03:00
orientation = " vertical " ,
2026-04-12 19:56:12 -03:00
wide_layout = False ,
2026-04-03 14:55:16 -03:00
)
)
2026-04-03 22:16:02 -03:00
panels . append (
timeseries_panel (
141 ,
" Fan History (0-10) " ,
None ,
2026-04-12 18:14:54 -03:00
{ " h " : 6 , " w " : 6 , " x " : 18 , " y " : 12 } ,
2026-04-03 22:16:02 -03:00
unit = " none " ,
max_value = 10 ,
targets = [
2026-04-09 20:10:52 -03:00
{ " refId " : " A " , " expr " : CLIMATE_FAN_OUTLET_SERIES , " legendFormat " : " Inside Outlet " } ,
{ " refId " : " B " , " expr " : CLIMATE_FAN_INSIDE_INLET_SERIES , " legendFormat " : " Inside Inlet " } ,
{ " refId " : " C " , " expr " : CLIMATE_FAN_OUTSIDE_INLET_SERIES , " legendFormat " : " Outside Inlet " } ,
{ " refId " : " D " , " expr " : CLIMATE_FAN_INTERIOR_SERIES , " legendFormat " : " Interior Fans " } ,
2026-04-03 22:16:02 -03:00
] ,
2026-04-12 18:14:54 -03:00
legend_display = " table " ,
legend_placement = " right " ,
2026-04-03 22:16:02 -03:00
links = link_to ( " atlas-power " ) ,
)
2026-04-03 14:55:16 -03:00
)
panels . append (
2026-04-09 16:35:14 -03:00
bargauge_panel (
2026-04-03 14:55:16 -03:00
44 ,
2026-04-09 16:35:14 -03:00
" One-off Job Pods (age hours) " ,
ONEOFF_JOB_POD_AGE_HOURS ,
2026-04-12 18:14:54 -03:00
{ " h " : 5 , " w " : 8 , " x " : 0 , " y " : 32 } ,
2026-04-04 01:33:15 -03:00
unit = " h " ,
2026-04-03 14:55:16 -03:00
instant = True ,
2026-04-09 16:35:14 -03:00
legend = " {{ namespace}}/ {{ pod}} " ,
thresholds = age_thresholds ,
limit = 12 ,
decimals = 2 ,
links = link_to ( " atlas-jobs " ) ,
2026-04-03 14:55:16 -03:00
)
)
panels . append (
2026-04-09 16:35:14 -03:00
{
" id " : 45 ,
" type " : " timeseries " ,
" title " : " Ariadne Attempts / Failures " ,
" datasource " : PROM_DS ,
2026-04-12 18:14:54 -03:00
" gridPos " : { " h " : 5 , " w " : 6 , " x " : 12 , " y " : 7 } ,
2026-04-09 16:35:14 -03:00
" targets " : [
{ " expr " : ARIADNE_TASK_ATTEMPTS_SERIES , " refId " : " A " , " legendFormat " : " Attempts " } ,
{ " expr " : ARIADNE_TASK_FAILURES_SERIES , " refId " : " B " , " legendFormat " : " Failures " } ,
] ,
" fieldConfig " : {
" defaults " : { " unit " : " none " } ,
" overrides " : [
{
" matcher " : { " id " : " byName " , " options " : " Attempts " } ,
" properties " : [
{ " id " : " color " , " value " : { " mode " : " fixed " , " fixedColor " : " green " } }
] ,
} ,
{
" matcher " : { " id " : " byName " , " options " : " Failures " } ,
" properties " : [
{ " id " : " color " , " value " : { " mode " : " fixed " , " fixedColor " : " red " } }
] ,
} ,
] ,
} ,
" options " : {
" legend " : { " displayMode " : " table " , " placement " : " right " } ,
" tooltip " : { " mode " : " multi " } ,
} ,
" links " : link_to ( " atlas-jobs " ) ,
}
2026-04-03 14:55:16 -03:00
)
2026-04-08 23:33:17 -03:00
test_success = timeseries_panel (
2026-04-03 14:55:16 -03:00
46 ,
" Platform Test Success Rate " ,
2026-04-08 23:33:17 -03:00
None ,
2026-04-12 18:14:54 -03:00
{ " h " : 5 , " w " : 6 , " x " : 18 , " y " : 7 } ,
2026-04-03 14:55:16 -03:00
unit = " percent " ,
2026-04-09 15:21:59 -03:00
targets = PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS ,
2026-04-09 13:39:55 -03:00
legend_display = " table " ,
legend_placement = " right " ,
2026-04-09 19:27:48 -03:00
legend_calcs = [ " lastNotNull " ] ,
2026-04-12 20:05:39 -03:00
links = link_to ( " atlas-testing " ) ,
2026-04-03 14:55:16 -03:00
)
2026-04-09 14:56:43 -03:00
test_success [ " fieldConfig " ] [ " defaults " ] [ " min " ] = 0
test_success [ " fieldConfig " ] [ " defaults " ] [ " max " ] = 100
2026-04-09 16:35:14 -03:00
test_success [ " fieldConfig " ] [ " defaults " ] [ " custom " ] = {
" drawStyle " : " line " ,
" lineInterpolation " : " linear " ,
" lineWidth " : 2 ,
" fillOpacity " : 10 ,
" showPoints " : " always " ,
" pointSize " : 4 ,
" spanNulls " : True ,
}
2026-04-09 20:05:10 -03:00
test_success [ " timeFrom " ] = " 7d "
2026-04-03 14:55:16 -03:00
test_success [ " description " ] = (
2026-04-09 20:05:10 -03:00
" Per-run interval pass points (0-100) for each software suite over the last 7 days. Points are connected to show trend; missing-run intervals are ignored. "
2026-04-03 14:55:16 -03:00
)
panels . append ( test_success )
2026-04-09 19:27:48 -03:00
panels . append (
2026-04-09 20:16:44 -03:00
bargauge_panel (
2026-04-09 19:27:48 -03:00
47 ,
2026-04-11 11:54:43 -03:00
" PVC Backup Health / Age " ,
PVC_BACKUP_AGE_HOURS_BY_PVC ,
2026-04-13 05:31:26 -03:00
{ " h " : 5 , " w " : 24 , " x " : 0 , " y " : 56 } ,
2026-04-11 11:54:43 -03:00
unit = " h " ,
2026-04-09 19:27:48 -03:00
instant = True ,
2026-04-11 11:54:43 -03:00
legend = " {{ namespace}}/ {{ pvc}} " ,
2026-04-09 20:16:44 -03:00
sort_order = " desc " ,
thresholds = {
" mode " : " absolute " ,
" steps " : [
2026-04-11 11:54:43 -03:00
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 6 } ,
{ " color " : " orange " , " value " : 12 } ,
{ " color " : " red " , " value " : 24 } ,
2026-04-09 20:16:44 -03:00
] ,
} ,
2026-04-09 19:27:48 -03:00
)
2026-04-03 14:55:16 -03:00
)
2026-04-11 11:54:43 -03:00
panels [ - 1 ] [ " links " ] = link_to ( " atlas-storage " )
2026-04-09 20:16:44 -03:00
panels [ - 1 ] [ " description " ] = (
2026-04-13 05:33:28 -03:00
" Backup age in hours computed from last-success timestamps for restic-managed PVCs. "
" PVCs that have backup history but currently no successful backup (missing/no_completed/error) are pinned to 999h for visibility. "
2026-04-09 20:16:44 -03:00
)
2026-04-12 18:14:54 -03:00
panels . append (
2026-04-13 00:25:33 -03:00
jenkins_weather_bargauge_panel (
142 ,
2026-04-13 05:31:26 -03:00
" Jenkins Last Run (hours ago, newest first) " ,
2026-04-13 00:25:33 -03:00
JENKINS_BUILD_WEATHER_LAST_RUN_AGE_HOURS ,
2026-04-13 05:31:26 -03:00
{ " h " : 5 , " w " : 5 , " x " : 8 , " y " : 32 } ,
2026-04-13 00:25:33 -03:00
unit = " h " ,
decimals = 2 ,
sort_order = " asc " ,
limit = 8 ,
thresholds = age_thresholds ,
links = link_to ( " atlas-jobs " ) ,
description = (
2026-04-13 05:31:26 -03:00
" Jenkins weather list from Ariadne: icon + name with status color and age since the most recent run. "
2026-04-13 00:25:33 -03:00
) ,
)
2026-04-12 18:14:54 -03:00
)
2026-04-13 05:31:26 -03:00
panels . append (
jenkins_weather_bargauge_panel (
243 ,
" Jenkins Last Success (hours ago) " ,
JENKINS_BUILD_WEATHER_LAST_SUCCESS_AGE_HOURS ,
{ " h " : 5 , " w " : 5 , " x " : 13 , " y " : 32 } ,
unit = " h " ,
decimals = 2 ,
sort_order = " asc " ,
limit = 8 ,
thresholds = age_thresholds ,
links = link_to ( " atlas-jobs " ) ,
description = " Per-job age since the most recent successful run. " ,
)
)
panels . append (
jenkins_weather_bargauge_panel (
244 ,
" Jenkins Last Duration (minutes) " ,
JENKINS_BUILD_WEATHER_LAST_DURATION_MINUTES ,
{ " h " : 5 , " w " : 6 , " x " : 18 , " y " : 32 } ,
unit = " m " ,
decimals = 2 ,
sort_order = " desc " ,
limit = 8 ,
thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 5 } ,
{ " color " : " orange " , " value " : 15 } ,
{ " color " : " red " , " value " : 30 } ,
] ,
} ,
links = link_to ( " atlas-jobs " ) ,
description = " Most recent completed build duration per Jenkins job. " ,
)
)
2026-04-03 14:55:16 -03:00
2026-01-06 02:34:52 -03:00
panels . append (
stat_panel (
2026-01-05 21:55:59 -03:00
30 ,
2026-01-06 02:34:52 -03:00
" Mail Sent (1d) " ,
' max(postmark_outbound_sent { window= " 1d " }) ' ,
2026-04-09 13:39:55 -03:00
{ " h " : 2 , " w " : 4 , " x " : 0 , " y " : 18 } ,
2026-01-06 02:34:52 -03:00
unit = " none " ,
links = link_to ( " atlas-mail " ) ,
)
)
panels . append (
{
" id " : 31 ,
" type " : " stat " ,
" title " : " Mail Bounces (1d) " ,
" datasource " : PROM_DS ,
2026-04-09 13:39:55 -03:00
" gridPos " : { " h " : 2 , " w " : 4 , " x " : 8 , " y " : 18 } ,
2026-01-06 02:34:52 -03:00
" targets " : [
{
" expr " : ' max(postmark_outbound_bounce_rate { window= " 1d " }) ' ,
" refId " : " A " ,
" legendFormat " : " Rate " ,
} ,
{
" expr " : ' max(postmark_outbound_bounced { window= " 1d " }) ' ,
" refId " : " B " ,
" legendFormat " : " Count " ,
} ,
] ,
" fieldConfig " : {
" defaults " : {
" color " : { " mode " : " thresholds " } ,
" custom " : { " displayMode " : " auto " } ,
" thresholds " : mail_bounce_rate_thresholds ,
" unit " : " none " ,
} ,
" overrides " : [
{
" matcher " : { " id " : " byName " , " options " : " Rate " } ,
" properties " : [ { " id " : " unit " , " value " : " percent " } ] ,
} ,
{
" matcher " : { " id " : " byName " , " options " : " Count " } ,
" properties " : [ { " id " : " unit " , " value " : " none " } ] ,
} ,
] ,
} ,
" options " : {
" colorMode " : " value " ,
" graphMode " : " area " ,
" justifyMode " : " center " ,
" reduceOptions " : { " calcs " : [ " lastNotNull " ] , " fields " : " " , " values " : False } ,
" textMode " : " name_and_value " ,
} ,
" links " : link_to ( " atlas-mail " ) ,
}
)
panels . append (
stat_panel (
2026-01-05 21:55:59 -03:00
32 ,
2026-01-06 02:34:52 -03:00
" Mail Success Rate (1d) " ,
' clamp_min(100 - max(postmark_outbound_bounce_rate { window= " 1d " }), 0) ' ,
2026-04-09 13:39:55 -03:00
{ " h " : 2 , " w " : 4 , " x " : 4 , " y " : 18 } ,
2026-01-06 02:34:52 -03:00
unit = " percent " ,
thresholds = mail_success_thresholds ,
decimals = 1 ,
links = link_to ( " atlas-mail " ) ,
)
)
panels . append (
stat_panel (
33 ,
2026-01-06 02:06:20 -03:00
" Mail Limit Used (30d) " ,
" max(postmark_sending_limit_used_percent) " ,
2026-04-09 13:39:55 -03:00
{ " h " : 2 , " w " : 4 , " x " : 12 , " y " : 18 } ,
2026-01-06 02:34:52 -03:00
unit = " percent " ,
thresholds = mail_limit_thresholds ,
decimals = 1 ,
links = link_to ( " atlas-mail " ) ,
2026-01-05 21:55:59 -03:00
)
2026-01-06 02:34:52 -03:00
)
2026-01-22 15:23:23 -03:00
panels . append (
2026-01-22 18:23:17 -03:00
stat_panel (
2026-01-22 15:23:23 -03:00
34 ,
" Postgres Connections Used " ,
2026-01-22 18:23:17 -03:00
POSTGRES_CONN_USED ,
2026-04-09 13:39:55 -03:00
{ " h " : 2 , " w " : 4 , " x " : 16 , " y " : 18 } ,
2026-01-22 18:23:17 -03:00
decimals = 0 ,
text_mode = " name_and_value " ,
legend = " {{ conn}} " ,
instant = True ,
2026-01-22 15:23:23 -03:00
)
)
panels . append (
stat_panel (
35 ,
" Postgres Hottest Connections " ,
POSTGRES_CONN_HOTTEST ,
2026-04-09 13:39:55 -03:00
{ " h " : 2 , " w " : 4 , " x " : 20 , " y " : 18 } ,
2026-01-22 15:23:23 -03:00
unit = " none " ,
decimals = 0 ,
text_mode = " name_and_value " ,
legend = " {{ datname}} " ,
instant = True ,
)
)
2026-01-05 21:55:59 -03:00
2026-01-01 14:44:33 -03:00
cpu_scope = " $namespace_scope_cpu "
gpu_scope = " $namespace_scope_gpu "
ram_scope = " $namespace_scope_ram "
2025-11-17 14:22:46 -03:00
panels . append (
2025-11-17 16:27:38 -03:00
pie_panel (
11 ,
2025-12-02 14:41:39 -03:00
" Namespace CPU Share " ,
2026-01-01 14:44:33 -03:00
namespace_cpu_share_expr ( cpu_scope ) ,
2026-04-04 01:33:15 -03:00
{ " h " : 9 , " w " : 8 , " x " : 0 , " y " : 23 } ,
2026-01-01 14:44:33 -03:00
links = namespace_scope_links ( " namespace_scope_cpu " ) ,
2026-01-18 02:50:07 -03:00
description = " Shares are normalized within the selected filter. Switching scope changes the denominator. " ,
2025-11-17 23:12:16 -03:00
)
)
panels . append (
pie_panel (
2025-11-17 23:42:55 -03:00
12 ,
2025-12-02 14:41:39 -03:00
" Namespace GPU Share " ,
2026-01-01 14:44:33 -03:00
namespace_gpu_share_expr ( gpu_scope ) ,
2026-04-04 01:33:15 -03:00
{ " h " : 9 , " w " : 8 , " x " : 8 , " y " : 23 } ,
2026-01-01 14:44:33 -03:00
links = namespace_scope_links ( " namespace_scope_gpu " ) ,
2026-01-18 02:50:07 -03:00
description = " Shares are normalized within the selected filter. Switching scope changes the denominator. " ,
2025-11-18 00:11:39 -03:00
)
)
panels . append (
pie_panel (
13 ,
2025-12-02 14:41:39 -03:00
" Namespace RAM Share " ,
2026-01-01 14:44:33 -03:00
namespace_ram_share_expr ( ram_scope ) ,
2026-04-04 01:33:15 -03:00
{ " h " : 9 , " w " : 8 , " x " : 16 , " y " : 23 } ,
2026-01-01 14:44:33 -03:00
links = namespace_scope_links ( " namespace_scope_ram " ) ,
2026-01-18 02:50:07 -03:00
description = " Shares are normalized within the selected filter. Switching scope changes the denominator. " ,
2025-11-17 14:22:46 -03:00
)
)
2025-11-17 21:48:12 -03:00
worker_filter = f " { WORKER_REGEX } "
2025-11-17 14:22:46 -03:00
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
14 ,
2025-12-02 14:41:39 -03:00
" Worker Node CPU " ,
2025-11-17 21:48:12 -03:00
node_cpu_expr ( worker_filter ) ,
2026-04-12 18:14:54 -03:00
{ " h " : 12 , " w " : 12 , " x " : 0 , " y " : 44 } ,
2025-11-17 14:22:46 -03:00
unit = " percent " ,
legend = " {{ node}} " ,
legend_calcs = [ " last " ] ,
legend_display = " table " ,
legend_placement = " right " ,
2025-11-17 16:27:38 -03:00
links = link_to ( " atlas-nodes " ) ,
2025-11-17 14:22:46 -03:00
)
)
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
15 ,
2025-12-02 14:41:39 -03:00
" Worker Node RAM " ,
2025-11-17 21:48:12 -03:00
node_mem_expr ( worker_filter ) ,
2026-04-12 18:14:54 -03:00
{ " h " : 12 , " w " : 12 , " x " : 12 , " y " : 44 } ,
2025-11-17 14:22:46 -03:00
unit = " percent " ,
legend = " {{ node}} " ,
legend_calcs = [ " last " ] ,
legend_display = " table " ,
legend_placement = " right " ,
2025-11-17 16:27:38 -03:00
links = link_to ( " atlas-nodes " ) ,
2025-11-17 14:22:46 -03:00
)
)
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
16 ,
2025-11-17 21:48:12 -03:00
" Control plane CPU " ,
2025-12-12 21:55:53 -03:00
node_cpu_expr ( CONTROL_ALL_REGEX ) ,
2026-04-12 18:14:54 -03:00
{ " h " : 10 , " w " : 12 , " x " : 0 , " y " : 56 } ,
2025-11-17 14:22:46 -03:00
unit = " percent " ,
legend = " {{ node}} " ,
2025-11-17 16:27:38 -03:00
legend_display = " table " ,
legend_placement = " right " ,
2025-11-17 14:22:46 -03:00
)
)
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
17 ,
2025-11-17 21:48:12 -03:00
" Control plane RAM " ,
2025-12-12 21:55:53 -03:00
node_mem_expr ( CONTROL_ALL_REGEX ) ,
2026-04-12 18:14:54 -03:00
{ " h " : 10 , " w " : 12 , " x " : 12 , " y " : 56 } ,
2025-11-17 14:22:46 -03:00
unit = " percent " ,
legend = " {{ node}} " ,
2025-11-17 16:27:38 -03:00
legend_display = " table " ,
legend_placement = " right " ,
2025-11-17 14:22:46 -03:00
)
)
2025-12-12 18:51:43 -03:00
panels . append (
pie_panel (
28 ,
2025-12-12 20:30:00 -03:00
" Node Pod Share " ,
2025-12-12 20:40:32 -03:00
' (sum(kube_pod_info { pod!= " " , node!= " " }) by (node) / clamp_min(sum(kube_pod_info { pod!= " " , node!= " " }), 1)) * 100 ' ,
2026-04-12 18:14:54 -03:00
{ " h " : 10 , " w " : 12 , " x " : 0 , " y " : 66 } ,
2025-12-12 18:51:43 -03:00
)
)
panels . append (
bargauge_panel (
29 ,
" Top Nodes by Pod Count " ,
2025-12-12 19:09:51 -03:00
' topk(12, sum(kube_pod_info { pod!= " " , node!= " " }) by (node)) ' ,
2026-04-12 18:14:54 -03:00
{ " h " : 10 , " w " : 12 , " x " : 12 , " y " : 66 } ,
2025-12-12 18:51:43 -03:00
unit = " none " ,
2025-12-12 18:56:13 -03:00
limit = 12 ,
2025-12-12 20:20:13 -03:00
decimals = 0 ,
thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 50 } ,
{ " color " : " orange " , " value " : 75 } ,
{ " color " : " red " , " value " : 100 } ,
] ,
} ,
2025-12-12 20:30:00 -03:00
instant = True ,
2025-12-12 18:51:43 -03:00
)
)
2025-11-17 14:22:46 -03:00
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
18 ,
2025-12-02 14:41:39 -03:00
" Cluster Ingress Throughput " ,
2025-11-17 16:27:38 -03:00
NET_INGRESS_EXPR ,
2026-04-12 18:14:54 -03:00
{ " h " : 7 , " w " : 8 , " x " : 0 , " y " : 37 } ,
2025-11-17 18:55:11 -03:00
unit = " Bps " ,
2025-11-18 14:08:33 -03:00
legend = " Ingress (Traefik) " ,
2025-11-17 16:27:38 -03:00
legend_display = " list " ,
legend_placement = " bottom " ,
links = link_to ( " atlas-network " ) ,
)
)
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
19 ,
2025-12-02 14:41:39 -03:00
" Cluster Egress Throughput " ,
2025-11-17 16:27:38 -03:00
NET_EGRESS_EXPR ,
2026-04-12 18:14:54 -03:00
{ " h " : 7 , " w " : 8 , " x " : 8 , " y " : 37 } ,
2025-11-17 18:55:11 -03:00
unit = " Bps " ,
2025-11-18 14:08:33 -03:00
legend = " Egress (Traefik) " ,
2025-11-17 16:27:38 -03:00
legend_display = " list " ,
legend_placement = " bottom " ,
links = link_to ( " atlas-network " ) ,
)
)
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
20 ,
2025-12-02 14:41:39 -03:00
" Intra-Cluster Throughput " ,
2025-11-18 14:08:33 -03:00
NET_INTERNAL_EXPR ,
2026-04-12 18:14:54 -03:00
{ " h " : 7 , " w " : 8 , " x " : 16 , " y " : 37 } ,
2025-11-18 14:08:33 -03:00
unit = " Bps " ,
legend = " Internal traffic " ,
legend_display = " list " ,
legend_placement = " bottom " ,
links = link_to ( " atlas-network " ) ,
)
)
panels . append (
timeseries_panel (
21 ,
2025-12-02 14:41:39 -03:00
" Root Filesystem Usage " ,
2025-11-17 14:22:46 -03:00
root_usage_expr ( ) ,
2026-04-12 18:14:54 -03:00
{ " h " : 16 , " w " : 12 , " x " : 0 , " y " : 76 } ,
2025-11-17 14:22:46 -03:00
unit = " percent " ,
legend = " {{ node}} " ,
legend_calcs = [ " last " ] ,
legend_display = " table " ,
legend_placement = " right " ,
2025-11-17 16:27:38 -03:00
time_from = " 30d " ,
links = link_to ( " atlas-storage " ) ,
2025-11-17 14:22:46 -03:00
)
)
panels . append (
2026-04-12 04:26:52 -03:00
timeseries_panel (
2025-12-02 13:16:00 -03:00
22 ,
2026-04-11 11:54:43 -03:00
" Nodes Closest to Full Astraios Disks " ,
2026-04-12 04:26:52 -03:00
astraios_usage_expr ( ) ,
2026-04-12 18:14:54 -03:00
{ " h " : 16 , " w " : 12 , " x " : 12 , " y " : 76 } ,
2025-12-02 13:16:00 -03:00
unit = " percent " ,
2026-04-12 04:26:52 -03:00
legend = " {{ node}} " ,
legend_calcs = [ " last " ] ,
legend_display = " table " ,
legend_placement = " right " ,
time_from = " 1w " ,
2025-12-02 13:16:00 -03:00
links = link_to ( " atlas-storage " ) ,
)
2025-11-17 14:22:46 -03:00
)
return {
" uid " : " atlas-overview " ,
" title " : " Atlas Overview " ,
2025-11-17 16:27:38 -03:00
" folderUid " : PUBLIC_FOLDER ,
2025-11-17 14:22:46 -03:00
" editable " : False ,
2025-11-17 16:27:38 -03:00
" annotations " : { " list " : [ ] } ,
2025-11-17 14:22:46 -03:00
" panels " : panels ,
" schemaVersion " : 39 ,
" style " : " dark " ,
" tags " : [ " atlas " , " overview " ] ,
2026-01-01 14:44:33 -03:00
" templating " : {
" list " : [
namespace_scope_variable ( " namespace_scope_cpu " , " CPU namespace filter " ) ,
namespace_scope_variable ( " namespace_scope_gpu " , " GPU namespace filter " ) ,
namespace_scope_variable ( " namespace_scope_ram " , " RAM namespace filter " ) ,
]
} ,
2025-12-02 14:41:39 -03:00
" time " : { " from " : " now-1h " , " to " : " now " } ,
" refresh " : " 1m " ,
2025-12-12 18:32:45 -03:00
" links " : [ ] ,
2025-11-17 14:22:46 -03:00
}
def build_pods_dashboard():
    """Build the "Atlas Pods" dashboard definition.

    Panels, in order: four stat tiles (ids 1-4), three tables (ids 5-7), a
    pie chart and a bar gauge of pods per node (ids 8-9), and a table that
    keeps, for each namespace, the node with the highest share of that
    namespace's pods (id 10).

    Returns:
        dict: dashboard metadata (uid, title, folder, time range, tags)
        together with the assembled ``panels`` list.
    """

    def grid(h, w, x, y):
        # Fresh gridPos mapping for every panel.
        return {" h ": h, " w ": w, " x ": x, " y ": y}

    def ladder(*steps):
        # Absolute-mode thresholds from (color, lower bound) pairs.
        return {
            " mode ": " absolute ",
            " steps ": [{" color ": color, " value ": floor} for color, floor in steps],
        }

    def red_from_one():
        # Green baseline, red as soon as the value reaches 1.
        return ladder((" green ", None), (" red ", 1))

    def labels_to_fields():
        return {" id ": " labelsToFields ", " options ": {}}

    panels = [
        stat_panel(
            1,
            " Problem Pods ",
            PROBLEM_PODS_EXPR,
            grid(4, 6, 0, 0),
            thresholds=red_from_one(),
        ),
        stat_panel(
            2,
            " CrashLoop / ImagePull ",
            CRASHLOOP_EXPR,
            grid(4, 6, 6, 0),
            thresholds=red_from_one(),
        ),
        stat_panel(
            3,
            " Stuck Terminating (>10m) ",
            STUCK_TERMINATING_EXPR,
            grid(4, 6, 12, 0),
            thresholds=red_from_one(),
        ),
        stat_panel(
            4,
            " Control Plane Workloads ",
            f' sum(kube_pod_info {{ node=~ " { CONTROL_REGEX } " ,namespace!~ " { CP_ALLOWED_NS } " }} ) ',
            grid(4, 6, 18, 0),
            thresholds=red_from_one(),
        ),
        table_panel(
            5,
            " Pods Not Running ",
            PROBLEM_TABLE_EXPR,
            grid(10, 24, 0, 4),
            unit=" s ",
            transformations=[labels_to_fields()],
        ),
        table_panel(
            6,
            " CrashLoop / ImagePull ",
            CRASHLOOP_TABLE_EXPR,
            grid(10, 24, 0, 14),
            unit=" s ",
            transformations=[labels_to_fields()],
        ),
        table_panel(
            7,
            " Terminating >10m ",
            STUCK_TABLE_EXPR,
            grid(10, 24, 0, 24),
            unit=" s ",
            transformations=[
                labels_to_fields(),
                {
                    " id ": " filterByValue ",
                    " options ": {" match ": " Value ", " operator ": " gt ", " value ": 600},
                },
            ],
        ),
        pie_panel(
            8,
            " Node Pod Share ",
            ' (sum(kube_pod_info { pod!= " " , node!= " " }) by (node) / clamp_min(sum(kube_pod_info { pod!= " " , node!= " " }), 1)) * 100 ',
            grid(8, 12, 12, 34),
        ),
        bargauge_panel(
            9,
            " Top Nodes by Pod Count ",
            ' topk(12, sum(kube_pod_info { pod!= " " , node!= " " }) by (node)) ',
            grid(8, 12, 0, 34),
            unit=" none ",
            limit=12,
            decimals=0,
            thresholds=ladder(
                (" green ", None),
                (" yellow ", 50),
                (" orange ", 75),
                (" red ", 100),
            ),
            instant=True,
        ),
    ]

    # Share of each namespace's pods that sit on each node, as a percentage.
    pods_share = (
        ' (sum by (namespace,node) (kube_pod_info { pod!= " " , node!= " " }) '
        ' / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info { pod!= " " }), 1) * 100) '
    )
    # One term per known node, each adding a distinct small offset
    # (position * 1e-3) -- presumably so equal shares do not tie.
    node_offsets = [
        f" (sum by (node) (kube_node_info {{ node= \" { name } \" }} ) * 0 + { position * 1e-3 } ) "
        for position, name in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
    ]
    offsets_union = " or ".join(node_offsets)
    ranked_share = f" { pods_share } + on(node) group_left() ( { offsets_union } ) "
    # 1 where a (namespace, node) series equals the namespace maximum, else 0.
    top_node_mask = (
        f" { ranked_share } == bool on(namespace) group_left() "
        f" (max by (namespace) ( { ranked_share } )) "
    )

    panels.append(
        table_panel(
            10,
            " Namespace Plurality by Node v27 ",
            (
                f" { pods_share } * on(namespace,node) group_left() "
                f" ( { top_node_mask } ) "
            ),
            grid(8, 24, 0, 42),
            unit=" percent ",
            transformations=[
                labels_to_fields(),
                {" id ": " organize ", " options ": {" excludeByName ": {" Time ": True}}},
                {
                    " id ": " filterByValue ",
                    " options ": {" match ": " Value ", " operator ": " gt ", " value ": 0},
                },
                {
                    " id ": " sortBy ",
                    " options ": {" fields ": [" Value "], " order ": " desc "},
                },
                {
                    " id ": " groupBy ",
                    " options ": {
                        " fields ": {
                            " namespace ": {
                                " aggregations ": [
                                    {" field ": " Value ", " operation ": " max "},
                                    {" field ": " node ", " operation ": " first "},
                                ]
                            }
                        },
                        " rowBy ": [" namespace "],
                    },
                },
            ],
            instant=True,
            options={" showColumnFilters ": False},
            filterable=False,
            footer={" show ": False, " fields ": " ", " calcs ": []},
            format=" table ",
        )
    )

    dashboard = {
        " uid ": " atlas-pods ",
        " title ": " Atlas Pods ",
        " folderUid ": PRIVATE_FOLDER,
        " editable ": True,
        " panels ": panels,
        " time ": {" from ": " now-12h ", " to ": " now "},
        " annotations ": {" list ": []},
        " schemaVersion ": 39,
        " style ": " dark ",
        " tags ": [" atlas ", " pods "],
    }
    return dashboard
def build_nodes_dashboard():
    """Build the "Atlas Nodes" dashboard definition.

    Panels: readiness and control-plane workload stats (ids 1-3), API server
    and etcd stats (ids 9-11), CPU/RAM time series for all nodes and for the
    control-plane set (ids 4-7), root filesystem usage (id 8) and Astraios
    usage (id 12).

    Returns:
        dict: dashboard metadata (uid, title, folder, time range, tags)
        together with the assembled ``panels`` list.
    """
    panels = []
    panels.append(
        stat_panel(
            1,
            " Worker Nodes Ready ",
            f' sum(kube_node_status_condition {{ condition= " Ready " ,status= " true " ,node=~ " { WORKER_REGEX } " }} ) ',
            {" h ": 4, " w ": 8, " x ": 0, " y ": 0},
            value_suffix=WORKER_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            2,
            " Control Plane Ready ",
            f' sum(kube_node_status_condition {{ condition= " Ready " ,status= " true " ,node=~ " { CONTROL_REGEX } " }} ) ',
            {" h ": 4, " w ": 8, " x ": 8, " y ": 0},
            value_suffix=CONTROL_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            3,
            " Control Plane Workloads ",
            f' sum(kube_pod_info {{ node=~ " { CONTROL_REGEX } " ,namespace!~ " { CP_ALLOWED_NS } " }} ) ',
            {" h ": 4, " w ": 8, " x ": 16, " y ": 0},
        )
    )
    panels.append(
        stat_panel(
            9,
            " API Server 5xx rate ",
            APISERVER_5XX_RATE,
            {" h ": 4, " w ": 8, " x ": 0, " y ": 4},
            unit=" req/s ",
            thresholds={
                " mode ": " absolute ",
                " steps ": [
                    {" color ": " green ", " value ": None},
                    {" color ": " yellow ", " value ": 0.05},
                    {" color ": " orange ", " value ": 0.2},
                    {" color ": " red ", " value ": 0.5},
                ],
            },
            decimals=3,
        )
    )
    panels.append(
        stat_panel(
            10,
            " API Server P99 latency ",
            APISERVER_P99_LATENCY_MS,
            {" h ": 4, " w ": 8, " x ": 8, " y ": 4},
            unit=" ms ",
            thresholds={
                " mode ": " absolute ",
                " steps ": [
                    {" color ": " green ", " value ": None},
                    {" color ": " yellow ", " value ": 250},
                    {" color ": " orange ", " value ": 400},
                    {" color ": " red ", " value ": 600},
                ],
            },
            decimals=1,
        )
    )
    panels.append(
        stat_panel(
            11,
            " etcd P99 latency ",
            ETCD_P99_LATENCY_MS,
            {" h ": 4, " w ": 8, " x ": 16, " y ": 4},
            unit=" ms ",
            thresholds={
                " mode ": " absolute ",
                " steps ": [
                    {" color ": " green ", " value ": None},
                    {" color ": " yellow ", " value ": 50},
                    {" color ": " orange ", " value ": 100},
                    {" color ": " red ", " value ": 200},
                ],
            },
            decimals=1,
        )
    )
    panels.append(
        timeseries_panel(
            4,
            " Node CPU ",
            node_cpu_expr(),
            {" h ": 9, " w ": 24, " x ": 0, " y ": 8},
            unit=" percent ",
            legend=" {{ node}} ",
            legend_calcs=[" last "],
            legend_display=" table ",
            legend_placement=" right ",
        )
    )
    panels.append(
        timeseries_panel(
            5,
            " Node RAM ",
            node_mem_expr(),
            {" h ": 9, " w ": 24, " x ": 0, " y ": 17},
            unit=" percent ",
            legend=" {{ node}} ",
            legend_calcs=[" last "],
            legend_display=" table ",
            legend_placement=" right ",
        )
    )
    panels.append(
        timeseries_panel(
            6,
            " Control Plane (incl. titan-db) CPU ",
            node_cpu_expr(CONTROL_ALL_REGEX),
            {" h ": 9, " w ": 12, " x ": 0, " y ": 26},
            unit=" percent ",
            legend=" {{ node}} ",
            legend_display=" table ",
            legend_placement=" right ",
        )
    )
    panels.append(
        timeseries_panel(
            7,
            " Control Plane (incl. titan-db) RAM ",
            node_mem_expr(CONTROL_ALL_REGEX),
            {" h ": 9, " w ": 12, " x ": 12, " y ": 26},
            unit=" percent ",
            legend=" {{ node}} ",
            legend_display=" table ",
            legend_placement=" right ",
        )
    )
    panels.append(
        timeseries_panel(
            8,
            " Root Filesystem Usage ",
            root_usage_expr(),
            {" h ": 9, " w ": 24, " x ": 0, " y ": 35},
            unit=" percent ",
            legend=" {{ node}} ",
            legend_display=" table ",
            legend_placement=" right ",
            time_from=" 30d ",
        )
    )
    panels.append(
        timeseries_panel(
            # Id 9 is already taken by "API Server 5xx rate" above; panel ids
            # must be unique within a dashboard, so use the next free id.
            12,
            " Astraios Usage ",
            astraios_usage_expr(),
            {" h ": 9, " w ": 24, " x ": 0, " y ": 44},
            unit=" percent ",
            legend=" {{ node}} ",
            legend_display=" table ",
            legend_placement=" right ",
            time_from=" 30d ",
        )
    )
    return {
        " uid ": " atlas-nodes ",
        " title ": " Atlas Nodes ",
        " folderUid ": PRIVATE_FOLDER,
        " editable ": True,
        " panels ": panels,
        " time ": {" from ": " now-12h ", " to ": " now "},
        " annotations ": {" list ": []},
        " schemaVersion ": 39,
        " style ": " dark ",
        " tags ": [" atlas ", " nodes "],
    }
def build_storage_dashboard():
    """Build the "Atlas Storage" dashboard definition.

    Panels: usage and free-space stats for the two mount points (ids 1-4),
    per-node usage time series (ids 5-6), usage history (ids 7-8), and two
    maintenance stats (ids 30-31).

    Returns:
        dict: dashboard metadata (uid, title, folder, time range, tags)
        together with the assembled ``panels`` list.
    """
    # "Sweepers Ready" is ready / desired * 100, so 100 is the healthy value.
    # PERCENT_THRESHOLDS turns red from 91.5 upward, which would paint a
    # fully ready daemonset red; use an inverted ladder for this panel.
    readiness_thresholds = {
        " mode ": " absolute ",
        " steps ": [
            {" color ": " red ", " value ": None},
            {" color ": " orange ", " value ": 50},
            {" color ": " yellow ", " value ": 75},
            {" color ": " green ", " value ": 100},
        ],
    }

    panels = []
    panels.append(
        stat_panel(
            1,
            " Astreae Usage ",
            astreae_usage_expr(" /mnt/astreae "),
            {" h ": 5, " w ": 6, " x ": 0, " y ": 0},
            unit=" percent ",
            thresholds=PERCENT_THRESHOLDS,
        )
    )
    panels.append(
        stat_panel(
            2,
            " Asteria Usage ",
            astreae_usage_expr(" /mnt/asteria "),
            {" h ": 5, " w ": 6, " x ": 6, " y ": 0},
            unit=" percent ",
            thresholds=PERCENT_THRESHOLDS,
        )
    )
    panels.append(
        stat_panel(
            3,
            " Astreae Free ",
            astreae_free_expr(" /mnt/astreae "),
            {" h ": 5, " w ": 6, " x ": 12, " y ": 0},
            unit=" decbytes ",
        )
    )
    panels.append(
        stat_panel(
            4,
            " Asteria Free ",
            astreae_free_expr(" /mnt/asteria "),
            {" h ": 5, " w ": 6, " x ": 18, " y ": 0},
            unit=" decbytes ",
        )
    )
    panels.append(
        timeseries_panel(
            5,
            " Astreae Per-Node Usage ",
            filesystem_usage_expr(" /mnt/astreae ", LONGHORN_NODE_REGEX),
            {" h ": 9, " w ": 12, " x ": 0, " y ": 5},
            unit=" percent ",
            legend=" {{ node}} ",
            legend_display=" table ",
            legend_placement=" right ",
            time_from=" 30d ",
        )
    )
    panels.append(
        timeseries_panel(
            6,
            " Asteria Per-Node Usage ",
            filesystem_usage_expr(" /mnt/asteria ", LONGHORN_NODE_REGEX),
            {" h ": 9, " w ": 12, " x ": 12, " y ": 5},
            unit=" percent ",
            legend=" {{ node}} ",
            legend_display=" table ",
            legend_placement=" right ",
            time_from=" 30d ",
        )
    )
    panels.append(
        timeseries_panel(
            7,
            " Astreae Usage History ",
            astreae_usage_expr(" /mnt/astreae "),
            {" h ": 9, " w ": 12, " x ": 0, " y ": 14},
            unit=" percent ",
            time_from=" 90d ",
        )
    )
    panels.append(
        timeseries_panel(
            8,
            " Asteria Usage History ",
            astreae_usage_expr(" /mnt/asteria "),
            {" h ": 9, " w ": 12, " x ": 12, " y ": 14},
            unit=" percent ",
            time_from=" 90d ",
        )
    )
    panels.append(
        stat_panel(
            30,
            " Maintenance Sweepers Ready ",
            ' kube_daemonset_status_number_ready { namespace= " maintenance " ,daemonset= " node-image-sweeper " } / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled { namespace= " maintenance " ,daemonset= " node-image-sweeper " } * 100 ',
            {" h ": 4, " w ": 12, " x ": 0, " y ": 44},
            unit=" percent ",
            thresholds=readiness_thresholds,
        )
    )
    panels.append(
        stat_panel(
            31,
            " Maintenance Cron Freshness (s) ",
            ' time() - max by (cronjob) (kube_cronjob_status_last_successful_time { namespace= " maintenance " ,cronjob= " image-sweeper " }) ',
            {" h ": 4, " w ": 12, " x ": 12, " y ": 44},
            unit=" s ",
            # Seconds since the last successful run: yellow after 1h, red after 3h.
            thresholds={
                " mode ": " absolute ",
                " steps ": [
                    {" color ": " green ", " value ": None},
                    {" color ": " yellow ", " value ": 3600},
                    {" color ": " red ", " value ": 10800},
                ],
            },
        )
    )
    return {
        " uid ": " atlas-storage ",
        " title ": " Atlas Storage ",
        " folderUid ": PRIVATE_FOLDER,
        " editable ": True,
        " panels ": panels,
        " time ": {" from ": " now-12h ", " to ": " now "},
        " annotations ": {" list ": []},
        " schemaVersion ": 39,
        " style ": " dark ",
        " tags ": [" atlas ", " storage "],
    }
2025-11-17 16:27:38 -03:00
def build_network_dashboard():
    """Build the "Atlas Network" dashboard definition.

    Panels, in order: ingress success rate, two error-budget burn stats and
    edge latency (ids 1-4); ingress, egress and intra-cluster traffic stats
    (ids 5-7); per-node throughput (id 8); top namespaces and pods tables
    (ids 9-10); Traefik router and entrypoint request rates (ids 11-12).

    Returns:
        dict: dashboard metadata (uid, title, folder, time range, tags)
        together with the assembled ``panels`` list.
    """

    def grid(h, w, x, y):
        # Fresh gridPos mapping for every panel.
        return {" h ": h, " w ": w, " x ": x, " y ": y}

    def ladder(*steps):
        # Absolute-mode thresholds from (color, lower bound) pairs.
        return {
            " mode ": " absolute ",
            " steps ": [{" color ": color, " value ": floor} for color, floor in steps],
        }

    def labels_to_fields():
        return {" id ": " labelsToFields ", " options ": {}}

    panels = [
        stat_panel(
            1,
            " Ingress Success Rate (5m) ",
            TRAEFIK_SLI_5M,
            grid(4, 6, 0, 0),
            unit=" percentunit ",
            decimals=2,
            thresholds=ladder(
                (" red ", None),
                (" orange ", 0.995),
                (" yellow ", 0.999),
                (" green ", 0.9995),
            ),
        )
    ]

    # The two burn tiles differ only in id, title, window and column.
    burn_tiles = (
        (2, " Error Budget Burn (1h) ", " 1h ", 6),
        (3, " Error Budget Burn (6h) ", " 6h ", 12),
    )
    for tile_id, tile_title, window, column in burn_tiles:
        panels.append(
            stat_panel(
                tile_id,
                tile_title,
                traefik_burn(window),
                grid(4, 6, column, 0),
                thresholds=ladder(
                    (" green ", None),
                    (" yellow ", 1),
                    (" orange ", 2),
                    (" red ", 4),
                ),
                decimals=2,
            )
        )

    panels.append(
        stat_panel(
            4,
            " Edge P99 Latency (ms) ",
            TRAEFIK_P99_LATENCY_MS,
            grid(4, 6, 18, 0),
            unit=" ms ",
            thresholds=ladder(
                (" green ", None),
                (" yellow ", 200),
                (" orange ", 350),
                (" red ", 500),
            ),
            decimals=1,
        )
    )

    traffic_tiles = (
        (5, " Ingress Traffic ", NET_INGRESS_EXPR, 0),
        (6, " Egress Traffic ", NET_EGRESS_EXPR, 8),
        (7, " Intra-Cluster Traffic ", NET_INTERNAL_EXPR, 16),
    )
    for tile_id, tile_title, query, column in traffic_tiles:
        panels.append(
            stat_panel(tile_id, tile_title, query, grid(4, 8, column, 4), unit=" Bps ")
        )

    panels.append(
        timeseries_panel(
            8,
            " Per-Node Throughput ",
            f' avg by (node) (( { NET_NODE_TX_PHYS } + { NET_NODE_RX_PHYS } ) * on(instance) group_left(node) { NODE_INFO } ) ',
            grid(8, 24, 0, 8),
            unit=" Bps ",
            legend=" {{ node}} ",
            legend_display=" table ",
            legend_placement=" right ",
        )
    )

    top_namespaces_query = (
        ' topk(10, sum(rate(container_network_transmit_bytes_total { namespace!= " " }[5m]) '
        ' + rate(container_network_receive_bytes_total { namespace!= " " }[5m])) by (namespace)) '
    )
    panels.append(
        table_panel(
            9,
            " Top Namespaces ",
            top_namespaces_query,
            grid(9, 12, 0, 16),
            unit=" Bps ",
            transformations=[labels_to_fields()],
        )
    )

    top_pods_query = (
        ' topk(10, sum(rate(container_network_transmit_bytes_total { pod!= " " }[5m]) '
        ' + rate(container_network_receive_bytes_total { pod!= " " }[5m])) by (namespace,pod)) '
    )
    panels.append(
        table_panel(
            10,
            " Top Pods ",
            top_pods_query,
            grid(9, 12, 12, 16),
            unit=" Bps ",
            transformations=[labels_to_fields()],
        )
    )

    panels.append(
        timeseries_panel(
            11,
            " Traefik Routers (req/s) ",
            f" topk(10, { TRAEFIK_ROUTER_EXPR } ) ",
            grid(9, 12, 0, 25),
            unit=" req/s ",
            legend=" {{ router}} ",
            legend_display=" table ",
            legend_placement=" right ",
        )
    )
    panels.append(
        timeseries_panel(
            12,
            " Traefik Entrypoints (req/s) ",
            ' sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m])) ',
            grid(9, 12, 12, 25),
            unit=" req/s ",
            legend=" {{ entrypoint}} ",
            legend_display=" table ",
            legend_placement=" right ",
        )
    )

    dashboard = {
        " uid ": " atlas-network ",
        " title ": " Atlas Network ",
        " folderUid ": PRIVATE_FOLDER,
        " editable ": True,
        " panels ": panels,
        " time ": {" from ": " now-12h ", " to ": " now "},
        " annotations ": {" list ": []},
        " schemaVersion ": 39,
        " style ": " dark ",
        " tags ": [" atlas ", " network "],
    }
    return dashboard
2026-01-05 21:55:59 -03:00
def build_mail_dashboard():
    """Build the "Atlas Mail" dashboard definition.

    Panels, in order: sent counts (ids 1-2), a hand-written stat panel with
    bounce rate and bounce count targets (id 3), success rate (id 4), sending
    limit, last success and exporter error stats (ids 5-8), and four time
    series (ids 13-16).

    Returns:
        dict: dashboard metadata (uid, title, folder, time range, tags)
        together with the assembled ``panels`` list.
    """

    def grid(h, w, x, y):
        # Fresh gridPos mapping for every panel.
        return {" h ": h, " w ": w, " x ": x, " y ": y}

    def ladder(*steps):
        # Absolute-mode thresholds from (color, lower bound) pairs.
        return {
            " mode ": " absolute ",
            " steps ": [{" color ": color, " value ": floor} for color, floor in steps],
        }

    bounce_rate_thresholds = ladder(
        (" green ", None),
        (" yellow ", 5),
        (" orange ", 8),
        (" red ", 10),
    )
    limit_thresholds = ladder(
        (" green ", None),
        (" yellow ", 70),
        (" orange ", 85),
        (" red ", 95),
    )
    success_thresholds = ladder(
        (" red ", None),
        (" orange ", 90),
        (" yellow ", 95),
        (" green ", 98),
    )

    panels = []

    sent_tiles = (
        (1, " Sent (1d) ", ' max(postmark_outbound_sent { window= " 1d " }) ', 0),
        (2, " Sent (7d) ", ' max(postmark_outbound_sent { window= " 7d " }) ', 6),
    )
    for tile_id, tile_title, query, column in sent_tiles:
        panels.append(
            stat_panel(tile_id, tile_title, query, grid(4, 6, column, 0), decimals=0)
        )

    # Panel 3 is written out by hand: it carries two targets and per-series
    # unit overrides keyed on the legend names "Rate" and "Count".
    bounce_targets = [
        {
            " expr ": ' max(postmark_outbound_bounce_rate { window= " 1d " }) ',
            " refId ": " A ",
            " legendFormat ": " Rate ",
        },
        {
            " expr ": ' max(postmark_outbound_bounced { window= " 1d " }) ',
            " refId ": " B ",
            " legendFormat ": " Count ",
        },
    ]
    bounce_overrides = [
        {
            " matcher ": {" id ": " byName ", " options ": " Rate "},
            " properties ": [{" id ": " unit ", " value ": " percent "}],
        },
        {
            " matcher ": {" id ": " byName ", " options ": " Count "},
            " properties ": [{" id ": " unit ", " value ": " none "}],
        },
    ]
    panels.append(
        {
            " id ": 3,
            " type ": " stat ",
            " title ": " Mail Bounces (1d) ",
            " datasource ": PROM_DS,
            " gridPos ": grid(4, 6, 12, 0),
            " targets ": bounce_targets,
            " fieldConfig ": {
                " defaults ": {
                    " color ": {" mode ": " thresholds "},
                    " custom ": {" displayMode ": " auto "},
                    " thresholds ": bounce_rate_thresholds,
                    " unit ": " none ",
                },
                " overrides ": bounce_overrides,
            },
            " options ": {
                " colorMode ": " value ",
                " graphMode ": " area ",
                " justifyMode ": " center ",
                " reduceOptions ": {
                    " calcs ": [" lastNotNull "],
                    " fields ": " ",
                    " values ": False,
                },
                " textMode ": " name_and_value ",
            },
        }
    )

    panels.append(
        stat_panel(
            4,
            " Success Rate (1d) ",
            ' clamp_min(100 - max(postmark_outbound_bounce_rate { window= " 1d " }), 0) ',
            grid(4, 6, 18, 0),
            unit=" percent ",
            thresholds=success_thresholds,
            decimals=1,
        )
    )
    panels.append(
        stat_panel(
            5,
            " Limit Used (30d) ",
            " max(postmark_sending_limit_used_percent) ",
            grid(4, 6, 0, 4),
            thresholds=limit_thresholds,
            unit=" percent ",
            decimals=1,
        )
    )
    panels.append(
        stat_panel(
            6,
            " Send Limit (30d) ",
            " max(postmark_sending_limit) ",
            grid(4, 6, 6, 4),
            decimals=0,
        )
    )
    panels.append(
        stat_panel(
            7,
            " Last Success ",
            " max(postmark_last_success_timestamp_seconds) ",
            grid(4, 6, 12, 4),
            unit=" dateTimeAsIso ",
            decimals=0,
        )
    )
    panels.append(
        stat_panel(
            8,
            " Exporter Errors ",
            " sum(postmark_request_errors_total) ",
            grid(4, 6, 18, 4),
            decimals=0,
        )
    )

    # Three per-window series share the same legend layout.
    window_series = (
        (13, " Bounce Rate (1d vs 7d) ", " max by (window) (postmark_outbound_bounce_rate) ", 0, 12, " percent "),
        (14, " Bounced (1d vs 7d) ", " max by (window) (postmark_outbound_bounced) ", 12, 12, " none "),
        (15, " Sent (1d vs 7d) ", " max by (window) (postmark_outbound_sent) ", 0, 20, " none "),
    )
    for series_id, series_title, query, column, row, series_unit in window_series:
        panels.append(
            timeseries_panel(
                series_id,
                series_title,
                query,
                grid(8, 12, column, row),
                unit=series_unit,
                legend=" {{ window}} ",
                legend_display=" table ",
                legend_placement=" right ",
            )
        )

    panels.append(
        timeseries_panel(
            16,
            " Exporter Errors ",
            " sum(postmark_request_errors_total) ",
            grid(8, 12, 12, 20),
            unit=" none ",
        )
    )

    dashboard = {
        " uid ": " atlas-mail ",
        " title ": " Atlas Mail ",
        " folderUid ": PRIVATE_FOLDER,
        " editable ": True,
        " panels ": panels,
        " time ": {" from ": " now-30d ", " to ": " now "},
        " annotations ": {" list ": []},
        " schemaVersion ": 39,
        " style ": " dark ",
        " tags ": [" atlas ", " mail "],
    }
    return dashboard
2026-01-21 13:37:36 -03:00
def build_jobs_dashboard ( ) :
# Build the "Atlas Jobs" dashboard definition (uid atlas-jobs).
#
# Creates 27 panels (ids 1-27) with the panel helper functions defined
# elsewhere in this file, appends them to a list in display order, and
# returns a plain dict holding the dashboard metadata plus that list.
# Takes no arguments; all queries come from module-level constants.
# The returned dashboard is placed in PRIVATE_FOLDER and defaults to a
# 7-day time window.
2026-01-18 02:50:07 -03:00
panels = [ ]
2026-01-21 11:29:29 -03:00
# Age scale used by the hour-based panels below: green by default,
# yellow from 6, orange from 24, red from 48.
age_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 6 } ,
{ " color " : " orange " , " value " : 24 } ,
{ " color " : " red " , " value " : 48 } ,
] ,
}
2026-04-12 20:05:39 -03:00
# More lenient age scale (yellow 24, orange 72, red 168); used by the
# Jenkins last-success and workspace PV age panels.
old_age_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 24 } ,
{ " color " : " orange " , " value " : 72 } ,
{ " color " : " red " , " value " : 168 } ,
] ,
}
2026-01-21 11:29:29 -03:00
# Inverted scale for the "Last Error (hours ago)" panel: the base colour
# is red and it only turns green from 24, so a small value (a recent
# error) is shown as the worst colour.
recent_error_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
{ " color " : " orange " , " value " : 1 } ,
{ " color " : " yellow " , " value " : 6 } ,
{ " color " : " green " , " value " : 24 } ,
] ,
}
2026-01-21 13:37:36 -03:00
# Error-count scale for the per-task errors panel: yellow 1, orange 3, red 5.
task_error_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 3 } ,
{ " color " : " red " , " value " : 5 } ,
] ,
}
2026-04-12 20:05:39 -03:00
# Tighter count scale shared by the summary stat panels (ids 2-5):
# yellow 1, orange 2, red 3.
count_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " red " , " value " : 3 } ,
] ,
}
2026-04-12 17:29:18 -03:00
# Scale for the last-status panel (id 9): red base, yellow from 0.5,
# green from 1.
schedule_status_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
{ " color " : " yellow " , " value " : 0.5 } ,
{ " color " : " green " , " value " : 1 } ,
] ,
}
2026-01-21 13:37:36 -03:00
2026-01-18 02:50:07 -03:00
# --- Row y=0: six stat panels, each 4 wide and 4 high (ids 1-6) ---
panels . append (
2026-04-12 20:05:39 -03:00
stat_panel (
2026-01-18 02:50:07 -03:00
1 ,
2026-04-12 20:05:39 -03:00
" Schedule Metrics Exported " ,
ARIADNE_SCHEDULE_SIGNAL_COUNT ,
{ " h " : 4 , " w " : 4 , " x " : 0 , " y " : 0 } ,
unit = " none " ,
instant = True ,
# Presence-style colouring: red until the value reaches 1, then green.
thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
{ " color " : " green " , " value " : 1 } ,
] ,
} ,
)
)
panels . append (
stat_panel (
2 ,
" Schedule Tasks Stale (>36h) " ,
ARIADNE_SCHEDULE_STALE_COUNT ,
{ " h " : 4 , " w " : 4 , " x " : 4 , " y " : 0 } ,
unit = " none " ,
thresholds = count_thresholds ,
)
)
panels . append (
stat_panel (
3 ,
" Schedule Tasks Missing Success " ,
ARIADNE_SCHEDULE_MISSING_SUCCESS_COUNT ,
{ " h " : 4 , " w " : 4 , " x " : 8 , " y " : 0 } ,
unit = " none " ,
thresholds = count_thresholds ,
)
)
panels . append (
stat_panel (
4 ,
" Schedule Tasks Failed Last Run " ,
ARIADNE_SCHEDULE_FAILED_LAST_COUNT ,
{ " h " : 4 , " w " : 4 , " x " : 12 , " y " : 0 } ,
unit = " none " ,
thresholds = count_thresholds ,
)
)
panels . append (
stat_panel (
5 ,
" Glue Jobs Stale (>36h) " ,
GLUE_STALE_COUNT ,
{ " h " : 4 , " w " : 4 , " x " : 16 , " y " : 0 } ,
unit = " none " ,
thresholds = count_thresholds ,
)
)
panels . append (
stat_panel (
6 ,
" Jenkins Workspace PV Backlog " ,
JENKINS_WORKSPACE_PV_STALE_COUNT ,
{ " h " : 4 , " w " : 4 , " x " : 20 , " y " : 0 } ,
2026-01-21 13:37:36 -03:00
unit = " none " ,
2026-04-12 20:05:39 -03:00
# Backlog uses its own, wider scale: yellow 1, orange 10, red 25.
thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 10 } ,
{ " color " : " red " , " value " : 25 } ,
] ,
} ,
)
)
# --- Row y=4: schedule inventory table and last-error bar gauge (ids 7-8) ---
# The table is sorted ascending on Value; with the value in hours since
# last success, the smallest age (most recent run) comes first, matching
# the "newest first" wording in the title.
schedule_list_panel = table_panel (
7 ,
" Ariadne Schedules: Last Success (h, newest first) " ,
2026-04-12 20:09:43 -03:00
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS_FALLBACK ,
2026-04-12 20:05:39 -03:00
{ " h " : 8 , " w " : 12 , " x " : 0 , " y " : 4 } ,
unit = " h " ,
transformations = [
{ " id " : " labelsToFields " , " options " : { } } ,
{ " id " : " sortBy " , " options " : { " fields " : [ " Value " ] , " order " : " asc " } } ,
] ,
instant = True ,
)
# The description is set on the returned panel dict after construction.
schedule_list_panel [ " description " ] = " Primary schedule inventory ordered by recency so fresh jobs stay at the top. "
panels . append ( schedule_list_panel )
panels . append (
bargauge_panel (
8 ,
" Ariadne Schedule Last Error (hours ago) " ,
2026-04-12 20:09:43 -03:00
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS_FALLBACK ,
2026-04-12 20:05:39 -03:00
{ " h " : 8 , " w " : 12 , " x " : 12 , " y " : 4 } ,
unit = " h " ,
2026-01-21 13:37:36 -03:00
instant = True ,
legend = " {{ task}} " ,
2026-04-12 20:05:39 -03:00
sort_order = " asc " ,
thresholds = recent_error_thresholds ,
decimals = 2 ,
2026-01-21 13:37:36 -03:00
)
)
2026-04-12 20:05:39 -03:00
# --- Row y=12: three 8-wide bar gauges per schedule task (ids 9-11) ---
status_panel = bargauge_panel (
9 ,
" Ariadne Schedule Last Status " ,
2026-04-12 20:09:43 -03:00
ARIADNE_SCHEDULE_LAST_STATUS_FALLBACK ,
2026-04-12 20:05:39 -03:00
{ " h " : 8 , " w " : 8 , " x " : 0 , " y " : 12 } ,
unit = " none " ,
instant = True ,
legend = " {{ task}} " ,
sort_order = " asc " ,
thresholds = schedule_status_thresholds ,
decimals = 0 ,
)
# NOTE(review): the description only explains 1 and 0, while the value
# mapping below also translates -1 to "pending" - consider mentioning it.
status_panel [ " description " ] = " 1 means the last run was ok. 0 means the last run ended in error. "
# Value mappings replace the numeric status with text: -1 pending,
# 0 error, 1 ok. Assumes bargauge_panel returns a dict that already
# contains fieldConfig.defaults - TODO confirm against the helper.
status_panel [ " fieldConfig " ] [ " defaults " ] [ " mappings " ] = [
{
" type " : " value " ,
" options " : {
2026-04-12 20:09:43 -03:00
" -1 " : { " text " : " pending " } ,
2026-04-12 20:05:39 -03:00
" 0 " : { " text " : " error " } ,
" 1 " : { " text " : " ok " } ,
} ,
}
]
panels . append ( status_panel )
schedule_runs_panel = bargauge_panel (
10 ,
" Ariadne Schedule Runs (range) " ,
2026-04-12 20:09:43 -03:00
ARIADNE_SCHEDULE_RUNS_RANGE_FALLBACK ,
2026-04-12 20:05:39 -03:00
{ " h " : 8 , " w " : 8 , " x " : 8 , " y " : 12 } ,
unit = " none " ,
instant = True ,
legend = " {{ task}} " ,
# Single green step: run counts are shown without severity colouring.
thresholds = { " mode " : " absolute " , " steps " : [ { " color " : " green " , " value " : None } ] } ,
)
schedule_runs_panel [ " description " ] = " Number of runs by schedule task over the selected dashboard time range. "
panels . append ( schedule_runs_panel )
schedule_errors_panel = bargauge_panel (
11 ,
" Ariadne Schedule Errors (range) " ,
2026-04-12 20:09:43 -03:00
ARIADNE_SCHEDULE_ERRORS_RANGE_FALLBACK ,
2026-04-12 20:05:39 -03:00
{ " h " : 8 , " w " : 8 , " x " : 16 , " y " : 12 } ,
unit = " none " ,
instant = True ,
legend = " {{ task}} " ,
thresholds = task_error_thresholds ,
)
schedule_errors_panel [ " description " ] = " Error run count by schedule task over the selected dashboard time range. "
panels . append ( schedule_errors_panel )
2026-01-21 13:37:36 -03:00
# --- Row y=20: attempts/failures time series and one-off job pods (ids 12-13) ---
# Panel 12 is written as a raw panel dict instead of going through
# timeseries_panel: it has two targets (refIds A and B) and pins a fixed
# colour to each legend name through byName overrides.
panels . append (
{
2026-04-12 20:05:39 -03:00
" id " : 12 ,
2026-01-21 13:37:36 -03:00
" type " : " timeseries " ,
2026-01-21 15:12:53 -03:00
" title " : " Ariadne Attempts / Failures " ,
2026-01-21 13:37:36 -03:00
" datasource " : PROM_DS ,
2026-04-12 20:05:39 -03:00
" gridPos " : { " h " : 7 , " w " : 12 , " x " : 0 , " y " : 20 } ,
2026-01-21 13:37:36 -03:00
" targets " : [
2026-01-21 14:30:55 -03:00
{ " expr " : ARIADNE_TASK_ATTEMPTS_SERIES , " refId " : " A " , " legendFormat " : " Attempts " } ,
2026-01-21 15:12:53 -03:00
{ " expr " : ARIADNE_TASK_FAILURES_SERIES , " refId " : " B " , " legendFormat " : " Failures " } ,
2026-01-21 13:37:36 -03:00
] ,
2026-01-21 14:30:55 -03:00
" fieldConfig " : {
" defaults " : { " unit " : " none " } ,
" overrides " : [
{
2026-01-21 15:12:53 -03:00
" matcher " : { " id " : " byName " , " options " : " Attempts " } ,
2026-01-21 14:30:55 -03:00
" properties " : [
2026-01-21 15:12:53 -03:00
{ " id " : " color " , " value " : { " mode " : " fixed " , " fixedColor " : " green " } }
2026-01-21 14:30:55 -03:00
] ,
} ,
{
" matcher " : { " id " : " byName " , " options " : " Failures " } ,
" properties " : [
{ " id " : " color " , " value " : { " mode " : " fixed " , " fixedColor " : " red " } }
] ,
} ,
] ,
} ,
2026-01-21 13:37:36 -03:00
" options " : {
" legend " : { " displayMode " : " table " , " placement " : " right " } ,
" tooltip " : { " mode " : " multi " } ,
} ,
}
)
panels . append (
bargauge_panel (
2026-04-12 20:05:39 -03:00
13 ,
2026-01-21 13:37:36 -03:00
" One-off Job Pods (age hours) " ,
ONEOFF_JOB_POD_AGE_HOURS ,
2026-04-12 20:05:39 -03:00
{ " h " : 7 , " w " : 12 , " x " : 12 , " y " : 20 } ,
2026-01-21 13:37:36 -03:00
unit = " h " ,
instant = True ,
legend = " {{ namespace}}/ {{ pod}} " ,
thresholds = age_thresholds ,
limit = 12 ,
2026-01-21 14:30:55 -03:00
decimals = 2 ,
2026-01-21 13:37:36 -03:00
)
)
2026-01-21 11:29:29 -03:00
# --- Row y=27: Glue cronjob last success / last schedule (ids 14-15) ---
panels . append (
bargauge_panel (
2026-04-12 20:05:39 -03:00
14 ,
" Glue Jobs Last Success (hours ago) " ,
GLUE_LAST_SUCCESS_RANGE_HOURS ,
{ " h " : 6 , " w " : 12 , " x " : 0 , " y " : 27 } ,
2026-01-21 11:29:29 -03:00
unit = " h " ,
instant = True ,
2026-04-12 20:05:39 -03:00
legend = " {{ namespace}}/ {{ cronjob}} " ,
thresholds = age_thresholds ,
2026-01-21 14:30:55 -03:00
decimals = 2 ,
2026-01-21 11:29:29 -03:00
)
)
panels . append (
bargauge_panel (
2026-04-12 20:05:39 -03:00
15 ,
" Glue Jobs Last Schedule (hours ago) " ,
GLUE_LAST_SCHEDULE_RANGE_HOURS ,
{ " h " : 6 , " w " : 12 , " x " : 12 , " y " : 27 } ,
2026-01-19 16:58:02 -03:00
unit = " h " ,
instant = True ,
2026-04-12 20:05:39 -03:00
legend = " {{ namespace}}/ {{ cronjob}} " ,
2026-01-21 11:29:29 -03:00
thresholds = age_thresholds ,
2026-01-21 14:30:55 -03:00
decimals = 2 ,
2026-01-19 16:58:02 -03:00
)
)
# --- Row y=33: six 4-wide stat panels (ids 16-21) ---
# The five Jenkins cleanup stats (16-20) share the same presentation
# arguments: name_and_value text mode, a fixed legend label, vertical
# orientation, wide layout, no sparkline graph, auto justification.
panels . append (
2026-04-12 20:05:39 -03:00
stat_panel (
16 ,
" Jenkins Cleanup Signal Present " ,
JENKINS_CLEANUP_SIGNAL_COUNT ,
{ " h " : 4 , " w " : 4 , " x " : 0 , " y " : 33 } ,
unit = " none " ,
2026-04-12 20:38:04 -03:00
text_mode = " name_and_value " ,
legend = " Signal " ,
2026-04-12 20:05:39 -03:00
instant = True ,
# Same presence-style colouring as panel 1: red below 1, green from 1.
thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
{ " color " : " green " , " value " : 1 } ,
] ,
} ,
2026-04-12 20:38:04 -03:00
orientation = " vertical " ,
wide_layout = True ,
graph_mode = " none " ,
justify_mode = " auto " ,
2026-04-12 20:05:39 -03:00
)
)
panels . append (
stat_panel (
17 ,
" Jenkins Cleanup Last Run Age (h) " ,
JENKINS_CLEANUP_LAST_RUN_AGE_HOURS ,
{ " h " : 4 , " w " : 4 , " x " : 4 , " y " : 33 } ,
2026-01-21 11:29:29 -03:00
unit = " h " ,
2026-04-12 20:05:39 -03:00
decimals = 2 ,
2026-04-12 20:38:04 -03:00
text_mode = " name_and_value " ,
legend = " Last Run " ,
2026-01-19 16:58:02 -03:00
instant = True ,
2026-01-21 11:29:29 -03:00
thresholds = age_thresholds ,
2026-04-12 20:38:04 -03:00
orientation = " vertical " ,
wide_layout = True ,
graph_mode = " none " ,
justify_mode = " auto " ,
2026-01-19 16:58:02 -03:00
)
)
2026-01-21 02:57:40 -03:00
panels . append (
2026-04-12 20:05:39 -03:00
stat_panel (
18 ,
" Jenkins Cleanup Last Success Age (h) " ,
JENKINS_CLEANUP_LAST_SUCCESS_AGE_HOURS ,
{ " h " : 4 , " w " : 4 , " x " : 8 , " y " : 33 } ,
2026-01-21 02:57:40 -03:00
unit = " h " ,
2026-04-12 20:05:39 -03:00
decimals = 2 ,
2026-04-12 20:38:04 -03:00
text_mode = " name_and_value " ,
legend = " Last Success " ,
2026-01-21 02:57:40 -03:00
instant = True ,
2026-01-21 11:29:29 -03:00
thresholds = age_thresholds ,
2026-04-12 20:38:04 -03:00
orientation = " vertical " ,
wide_layout = True ,
graph_mode = " none " ,
justify_mode = " auto " ,
2026-01-21 11:29:29 -03:00
)
)
# Panels 19-21 pass no thresholds argument, so they use whatever
# stat_panel applies by default.
panels . append (
2026-04-12 20:05:39 -03:00
stat_panel (
19 ,
" Jenkins Cleanup Planned (last run) " ,
JENKINS_CLEANUP_LAST_PLANNED ,
{ " h " : 4 , " w " : 4 , " x " : 12 , " y " : 33 } ,
2026-01-21 13:37:36 -03:00
unit = " none " ,
2026-04-12 20:38:04 -03:00
text_mode = " name_and_value " ,
legend = " Planned " ,
2026-01-21 13:37:36 -03:00
instant = True ,
2026-04-12 20:38:04 -03:00
orientation = " vertical " ,
wide_layout = True ,
graph_mode = " none " ,
justify_mode = " auto " ,
2026-01-21 13:37:36 -03:00
)
)
panels . append (
2026-04-12 20:05:39 -03:00
stat_panel (
20 ,
" Jenkins Cleanup Deleted (last run) " ,
JENKINS_CLEANUP_LAST_DELETED ,
{ " h " : 4 , " w " : 4 , " x " : 16 , " y " : 33 } ,
2026-01-21 13:37:36 -03:00
unit = " none " ,
2026-04-12 20:38:04 -03:00
text_mode = " name_and_value " ,
legend = " Deleted " ,
2026-01-21 13:37:36 -03:00
instant = True ,
2026-04-12 20:38:04 -03:00
orientation = " vertical " ,
wide_layout = True ,
graph_mode = " none " ,
justify_mode = " auto " ,
2026-01-21 13:37:36 -03:00
)
)
panels . append (
2026-04-12 20:05:39 -03:00
stat_panel (
21 ,
2026-01-21 11:29:29 -03:00
" Ariadne Access Requests " ,
ARIADNE_ACCESS_REQUESTS ,
2026-04-12 20:05:39 -03:00
{ " h " : 4 , " w " : 4 , " x " : 20 , " y " : 33 } ,
2026-01-21 11:29:29 -03:00
unit = " none " ,
instant = True ,
2026-01-21 02:57:40 -03:00
)
)
2026-04-12 20:05:39 -03:00
# --- Row y=37: Jenkins cleanup history (ids 22-23) ---
# Both panels pass None as the positional expression and supply the
# query through targets= instead, with a multi-label legend format.
panels . append (
timeseries_panel (
22 ,
" Jenkins Cleanup Runs (range) " ,
None ,
{ " h " : 7 , " w " : 12 , " x " : 0 , " y " : 37 } ,
unit = " none " ,
targets = [
{ " refId " : " A " , " expr " : JENKINS_CLEANUP_RUNS_RANGE , " legendFormat " : " {{ mode}}/ {{ status}} " } ,
] ,
legend_display = " table " ,
legend_placement = " right " ,
)
)
panels . append (
timeseries_panel (
23 ,
" Jenkins Cleanup Objects (range) " ,
None ,
{ " h " : 7 , " w " : 12 , " x " : 12 , " y " : 37 } ,
unit = " none " ,
targets = [
{ " refId " : " A " , " expr " : JENKINS_CLEANUP_OBJECTS_RANGE , " legendFormat " : " {{ kind}}/ {{ action}}/ {{ mode}} " } ,
] ,
legend_display = " table " ,
legend_placement = " right " ,
)
)
2026-04-13 00:25:33 -03:00
# --- Row y=44: three 8-wide Jenkins build weather gauges (ids 24-26) ---
# These use the dedicated jenkins_weather_bargauge_panel helper, which
# takes the description as a keyword argument; each is capped at 20 bars.
panels . append (
jenkins_weather_bargauge_panel (
24 ,
" Jenkins Build Weather (last run h, newest first) " ,
JENKINS_BUILD_WEATHER_LAST_RUN_AGE_HOURS ,
{ " h " : 8 , " w " : 8 , " x " : 0 , " y " : 44 } ,
unit = " h " ,
decimals = 2 ,
sort_order = " asc " ,
limit = 20 ,
thresholds = age_thresholds ,
description = (
" Jenkins homepage-style weather list: bars are color-coded by last build "
" status, icons mirror job weather, and values show age since the last run. "
) ,
)
)
panels . append (
jenkins_weather_bargauge_panel (
25 ,
" Jenkins Last Success (hours ago) " ,
JENKINS_BUILD_WEATHER_LAST_SUCCESS_AGE_HOURS ,
{ " h " : 8 , " w " : 8 , " x " : 8 , " y " : 44 } ,
unit = " h " ,
decimals = 2 ,
sort_order = " asc " ,
limit = 20 ,
thresholds = old_age_thresholds ,
description = " Per-job age since the most recent successful run. " ,
)
)
panels . append (
jenkins_weather_bargauge_panel (
26 ,
" Jenkins Last Duration (minutes) " ,
JENKINS_BUILD_WEATHER_LAST_DURATION_MINUTES ,
{ " h " : 8 , " w " : 8 , " x " : 16 , " y " : 44 } ,
unit = " m " ,
decimals = 2 ,
sort_order = " desc " ,
limit = 20 ,
# Duration scale: yellow from 5, orange from 15, red from 30.
thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 5 } ,
{ " color " : " orange " , " value " : 15 } ,
{ " color " : " red " , " value " : 30 } ,
] ,
} ,
description = " Most recent completed build duration per Jenkins job. " ,
)
)
2026-04-12 20:05:39 -03:00
# --- Row y=52: full-width workspace PV backlog detail (id 27) ---
# NOTE(review): the description below promises "oldest ... first", but no
# sort_order is passed here, so the ordering relies on bargauge_panel's
# default - confirm that default sorts descending.
stale_volume_panel = bargauge_panel (
2026-04-13 00:25:33 -03:00
27 ,
2026-04-12 20:05:39 -03:00
" Jenkins Workspace PV Age (h, detached only) " ,
JENKINS_WORKSPACE_PV_STALE_AGE_HOURS ,
2026-04-13 00:25:33 -03:00
{ " h " : 10 , " w " : 24 , " x " : 0 , " y " : 52 } ,
2026-04-12 20:05:39 -03:00
unit = " h " ,
instant = True ,
legend = " {{ name}} -> {{ persistentvolume}} " ,
thresholds = old_age_thresholds ,
decimals = 1 ,
limit = 40 ,
)
stale_volume_panel [ " description " ] = (
" Oldest detached Jenkins workspace volumes first. This is the direct cleanup backlog view. "
)
panels . append ( stale_volume_panel )
# Dashboard envelope: metadata plus the panel list built above.
return {
" uid " : " atlas-jobs " ,
" title " : " Atlas Jobs " ,
" folderUid " : PRIVATE_FOLDER ,
" editable " : True ,
" panels " : panels ,
" time " : { " from " : " now-7d " , " to " : " now " } ,
" annotations " : { " list " : [ ] } ,
" schemaVersion " : 39 ,
" style " : " dark " ,
" tags " : [ " atlas " , " jobs " , " glue " ] ,
}
def build_testing_dashboard ( ) :
# Build the "Atlas Testing" dashboard definition (uid atlas-testing).
#
# Creates nine panels (ids 1-9) covering platform test success rates,
# failure counts, per-suite coverage and code-smell infractions, then
# returns a plain dict with the dashboard metadata and the panel list.
# Takes no arguments; all queries come from module-level constants.
# The returned dashboard is placed in PRIVATE_FOLDER and defaults to a
# 30-day time window.
panels = [ ]
# Percentage scale where higher is better: red base, orange from 80,
# yellow from 95, green from 99.
pass_rate_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
{ " color " : " orange " , " value " : 80 } ,
{ " color " : " yellow " , " value " : 95 } ,
{ " color " : " green " , " value " : 99 } ,
] ,
}
# Failure-count scale: green base, yellow from 1, orange from 3, red from 5.
failures_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 3 } ,
{ " color " : " red " , " value " : 5 } ,
] ,
}
2026-04-12 22:58:21 -03:00
# Like failures_thresholds, but the base colour is red and green only
# starts at 0. Values below 0 therefore render red; the smell panel
# below uses -1 as its "missing" sentinel.
smell_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
{ " color " : " green " , " value " : 0 } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 3 } ,
{ " color " : " red " , " value " : 5 } ,
] ,
}
2026-04-12 20:05:39 -03:00
# --- Top-left: two 6-wide summary stats, 4 high (ids 1-2) ---
pass_rate_panel = stat_panel (
1 ,
2026-04-04 01:33:15 -03:00
" Platform Test Success Rate (30d) " ,
TEST_SUCCESS_RATE ,
2026-04-12 20:05:39 -03:00
{ " h " : 4 , " w " : 6 , " x " : 0 , " y " : 0 } ,
2026-03-31 14:51:49 -03:00
unit = " percent " ,
2026-04-04 01:33:15 -03:00
decimals = 2 ,
2026-03-31 14:51:49 -03:00
instant = True ,
2026-04-12 20:05:39 -03:00
thresholds = pass_rate_thresholds ,
)
# Descriptions are attached by mutating the dict the helper returned.
pass_rate_panel [ " description " ] = " Overall success rate across tracked suites over the last 30 days. "
panels . append ( pass_rate_panel )
failures_panel = stat_panel (
2 ,
" Platform Test Failures (24h) " ,
TEST_FAILURES_24H_TOTAL ,
{ " h " : 4 , " w " : 6 , " x " : 6 , " y " : 0 } ,
unit = " none " ,
instant = True ,
thresholds = failures_thresholds ,
2026-03-31 14:51:49 -03:00
)
2026-04-12 20:05:39 -03:00
failures_panel [ " description " ] = " Total failed runs in the last 24 hours. "
panels . append ( failures_panel )
# --- Top-right: activity table, 12 wide and 8 high (id 3) ---
# Labels are turned into columns and rows are sorted by Value descending.
activity_panel = table_panel (
3 ,
2026-04-04 01:33:15 -03:00
" Platform Test Activity (30d) " ,
PLATFORM_TEST_ACTIVITY_30D ,
2026-04-12 20:05:39 -03:00
{ " h " : 8 , " w " : 12 , " x " : 12 , " y " : 0 } ,
2026-03-31 14:51:49 -03:00
unit = " none " ,
2026-04-12 20:05:39 -03:00
transformations = [
{ " id " : " labelsToFields " , " options " : { } } ,
{ " id " : " sortBy " , " options " : { " fields " : [ " Value " ] , " order " : " desc " } } ,
] ,
2026-03-31 14:51:49 -03:00
instant = True ,
)
2026-04-12 20:05:39 -03:00
activity_panel [ " description " ] = " Suite/status event counts over 30 days. "
panels . append ( activity_panel )
# --- Row y=8: per-suite 24h failures and success rate (ids 4-5) ---
panels . append (
bargauge_panel (
4 ,
" Platform Test Failures by Suite (24h) " ,
PLATFORM_TEST_FAILURES_24H_BY_SUITE ,
{ " h " : 8 , " w " : 12 , " x " : 0 , " y " : 8 } ,
unit = " none " ,
instant = True ,
legend = " {{ suite}} " ,
thresholds = failures_thresholds ,
)
)
panels . append (
bargauge_panel (
5 ,
" Platform Test Success Rate by Suite (24h, lowest first) " ,
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE ,
{ " h " : 8 , " w " : 12 , " x " : 12 , " y " : 8 } ,
unit = " percent " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " asc " ,
thresholds = pass_rate_thresholds ,
decimals = 2 ,
)
2026-03-31 14:51:49 -03:00
)
2026-04-08 23:33:17 -03:00
# --- Row y=16: full-width success-rate trend per suite (id 6) ---
# The positional expression is None; the series come from the
# PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS list passed as targets=.
suite_panel = timeseries_panel (
2026-04-12 20:05:39 -03:00
6 ,
2026-04-08 23:33:17 -03:00
" Platform Test Success Rate by Suite " ,
None ,
2026-04-12 20:05:39 -03:00
{ " h " : 8 , " w " : 24 , " x " : 0 , " y " : 16 } ,
2026-04-08 23:33:17 -03:00
unit = " percent " ,
2026-04-09 15:21:59 -03:00
targets = PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS ,
2026-04-08 23:33:17 -03:00
legend_display = " list " ,
legend_placement = " bottom " ,
)
2026-04-09 14:56:43 -03:00
# Pin the axis to 0-100 and set the line style directly on the returned
# dict. Assumes timeseries_panel returns fieldConfig.defaults - TODO confirm.
suite_panel [ " fieldConfig " ] [ " defaults " ] [ " min " ] = 0
suite_panel [ " fieldConfig " ] [ " defaults " ] [ " max " ] = 100
2026-04-09 16:35:14 -03:00
# This assignment replaces the whole "custom" entry rather than merging
# into any value the helper may have set.
suite_panel [ " fieldConfig " ] [ " defaults " ] [ " custom " ] = {
" drawStyle " : " line " ,
" lineInterpolation " : " linear " ,
" lineWidth " : 2 ,
" fillOpacity " : 10 ,
" showPoints " : " always " ,
" pointSize " : 4 ,
" spanNulls " : True ,
}
2026-04-12 20:05:39 -03:00
suite_panel [ " description " ] = " Trend line per suite. Flat gaps mean no runs in that interval. "
2026-04-08 23:33:17 -03:00
panels . append ( suite_panel )
2026-04-12 22:58:21 -03:00
# --- Row y=24: quality-gate coverage and gap to target (ids 7-8) ---
# NOTE(review): the title and description state a gate of 95, but
# pass_rate_thresholds only turns green at 99 (95-99 is yellow) -
# confirm this colouring is intended for the coverage panel.
coverage_panel = bargauge_panel (
7 ,
" Quality Gate Coverage by Suite ( % , gate 95) " ,
QUALITY_GATE_COVERAGE_BY_SUITE_WITH_MISSING ,
{ " h " : 8 , " w " : 12 , " x " : 0 , " y " : 24 } ,
unit = " percent " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " asc " ,
thresholds = pass_rate_thresholds ,
decimals = 2 ,
)
coverage_panel [ " description " ] = (
" Latest reported per-suite line coverage. The quality gate target is 95 % . "
" A value of -1 means that suite has runs but no coverage metric published yet. "
)
# Show the -1 sentinel as the text "missing" instead of a number.
coverage_panel [ " fieldConfig " ] [ " defaults " ] [ " mappings " ] = [
{
" type " : " value " ,
" options " : {
" -1 " : { " text " : " missing " } ,
} ,
}
]
panels . append ( coverage_panel )
coverage_gap_panel = bargauge_panel (
8 ,
" Coverage Gap to 95 % by Suite " ,
QUALITY_GATE_COVERAGE_GAP_BY_SUITE ,
{ " h " : 8 , " w " : 12 , " x " : 12 , " y " : 24 } ,
unit = " percent " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " desc " ,
# Gap scale: green base, yellow from 1, orange from 5, red from 10.
thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 5 } ,
{ " color " : " red " , " value " : 10 } ,
] ,
} ,
decimals = 2 ,
)
coverage_gap_panel [ " description " ] = " How far each suite is below the 95 % target (0 means at or above target). "
panels . append ( coverage_gap_panel )
# --- Row y=32: full-width code-smell infractions per suite (id 9) ---
smell_panel = bargauge_panel (
9 ,
" Code Smell Infractions by Suite (files >500 LOC) " ,
QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE_WITH_MISSING ,
{ " h " : 8 , " w " : 24 , " x " : 0 , " y " : 32 } ,
unit = " none " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " desc " ,
thresholds = smell_thresholds ,
)
smell_panel [ " description " ] = (
" Per-suite count of files violating the 500-line hygiene/code-smell threshold. "
" A value of -1 means that suite has runs but no smell-infraction metric published yet. "
)
# Same -1 -> "missing" value mapping as the coverage panel.
smell_panel [ " fieldConfig " ] [ " defaults " ] [ " mappings " ] = [
{
" type " : " value " ,
" options " : {
" -1 " : { " text " : " missing " } ,
} ,
}
]
panels . append ( smell_panel )
2026-01-18 02:50:07 -03:00
# Dashboard envelope: metadata plus the panel list built above.
return {
2026-04-12 20:05:39 -03:00
" uid " : " atlas-testing " ,
" title " : " Atlas Testing " ,
2026-01-18 02:50:07 -03:00
" folderUid " : PRIVATE_FOLDER ,
" editable " : True ,
" panels " : panels ,
2026-04-12 20:05:39 -03:00
" time " : { " from " : " now-30d " , " to " : " now " } ,
2026-01-18 02:50:07 -03:00
" annotations " : { " list " : [ ] } ,
" schemaVersion " : 39 ,
" style " : " dark " ,
2026-04-12 20:05:39 -03:00
" tags " : [ " atlas " , " testing " , " quality " ] ,
2026-01-18 02:50:07 -03:00
}
2026-04-03 14:55:16 -03:00
def build_power_dashboard ( ) :
# Build the "Atlas Power" dashboard definition (uid atlas-power).
#
# Creates six panels laid out as three rows of two 12-wide panels: a
# "current value" stat panel on the left and the matching history
# time series on the right, for UPS load, climate and fan activity.
# Takes no arguments; all queries come from module-level constants.
# Returns a plain dict with the dashboard metadata and the panel list,
# placed in PRIVATE_FOLDER with a default 24-hour time window.
panels = [ ]
# --- Row y=0: UPS current load (id 1) and power draw history (id 2) ---
panels . append (
2026-04-12 19:56:12 -03:00
stat_panel (
2026-04-03 14:55:16 -03:00
1 ,
2026-04-03 20:45:40 -03:00
" UPS Current Load " ,
2026-04-12 22:53:23 -03:00
UPS_CURRENT_ROW_EXPR ,
2026-04-03 20:45:40 -03:00
{ " h " : 8 , " w " : 12 , " x " : 0 , " y " : 0 } ,
2026-04-12 19:56:12 -03:00
unit = " none " ,
decimals = 1 ,
text_mode = " name_and_value " ,
2026-04-12 22:53:23 -03:00
legend = " {{ ups}} {{ metric}} " ,
2026-04-12 19:46:39 -03:00
instant = True ,
2026-04-03 20:45:40 -03:00
# The panel-wide unit is "none"; units are assigned per field by
# regex on the display name: names ending in "Draw" get watts and
# names ending in "Runtime" get seconds.
field_overrides = [
2026-04-12 21:32:48 -03:00
{ " matcher " : { " id " : " byRegexp " , " options " : " .*Draw$ " } , " properties " : [ { " id " : " unit " , " value " : " watt " } ] } ,
{ " matcher " : { " id " : " byRegexp " , " options " : " .*Runtime$ " } , " properties " : [ { " id " : " unit " , " value " : " s " } ] } ,
2026-04-03 20:45:40 -03:00
] ,
2026-04-12 22:14:59 -03:00
orientation = " vertical " ,
2026-04-12 21:28:14 -03:00
wide_layout = False ,
2026-04-12 22:03:13 -03:00
title_size = 14 ,
value_size = 24 ,
2026-04-03 20:45:40 -03:00
description = (
2026-04-12 19:56:12 -03:00
" Per-UPS live snapshot: draw, discharge runtime, and charging/discharging status. "
2026-04-03 20:45:40 -03:00
) ,
2026-04-03 14:55:16 -03:00
)
)
panels . append (
2026-04-03 20:45:40 -03:00
timeseries_panel (
2026-04-03 14:55:16 -03:00
2 ,
2026-04-03 20:45:40 -03:00
" UPS History (Power Draw) " ,
None ,
{ " h " : 8 , " w " : 12 , " x " : 12 , " y " : 0 } ,
unit = " watt " ,
# Positional expression is None; three explicit targets are used
# instead: one per UPS (legend taken from the *_NAME constants)
# plus a "combined" total series.
targets = [
2026-04-08 23:33:17 -03:00
{ " refId " : " A " , " expr " : ANANKE_UPS_DRAW_WATTS_DB_SERIES , " legendFormat " : ANANKE_UPS_DB_NAME } ,
{ " refId " : " B " , " expr " : ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES , " legendFormat " : ANANKE_UPS_TETHYS_NAME } ,
{ " refId " : " C " , " expr " : ANANKE_UPS_DRAW_WATTS_TOTAL_SERIES , " legendFormat " : " combined " } ,
2026-04-03 20:45:40 -03:00
] ,
legend_display = " table " ,
legend_placement = " right " ,
description = " Historical UPS power consumption in watts for titan-db, tethys, and combined load. " ,
2026-04-03 14:55:16 -03:00
)
)
# --- Row y=8: current climate (id 3) and climate history (id 4) ---
panels . append (
2026-04-12 19:56:12 -03:00
stat_panel (
2026-04-03 14:55:16 -03:00
3 ,
2026-04-03 20:45:40 -03:00
" Current Climate " ,
2026-04-12 22:53:23 -03:00
CLIMATE_CURRENT_ROW_EXPR ,
2026-04-03 17:49:09 -03:00
{ " h " : 8 , " w " : 12 , " x " : 0 , " y " : 8 } ,
2026-04-12 19:56:12 -03:00
unit = " none " ,
decimals = 2 ,
text_mode = " name_and_value " ,
2026-04-12 22:53:23 -03:00
legend = " {{ metric}} " ,
2026-04-12 19:46:39 -03:00
instant = True ,
2026-04-03 20:45:40 -03:00
# One exact-name override per field assigns its unit. Presumably
# these names match the "metric" label values produced by
# CLIMATE_CURRENT_ROW_EXPR - verify against that expression.
field_overrides = [
2026-04-12 19:46:39 -03:00
{ " matcher " : { " id " : " byName " , " options " : " Temp °C " } , " properties " : [ { " id " : " unit " , " value " : " celsius " } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " Temp °F " } , " properties " : [ { " id " : " unit " , " value " : " fahrenheit " } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " Humidity " } , " properties " : [ { " id " : " unit " , " value " : " percent " } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " Pressure " } , " properties " : [ { " id " : " unit " , " value " : " suffix:kPa " } ] } ,
2026-04-03 20:45:40 -03:00
] ,
2026-04-12 20:11:37 -03:00
orientation = " vertical " ,
2026-04-12 21:28:14 -03:00
wide_layout = False ,
2026-04-12 22:03:13 -03:00
title_size = 16 ,
value_size = 28 ,
2026-04-12 19:56:12 -03:00
description = " Current tent values: Temp °C, Temp °F, Humidity, Pressure. " ,
2026-04-03 14:55:16 -03:00
)
)
panels . append (
timeseries_panel (
2026-04-03 20:45:40 -03:00
4 ,
" Climate History " ,
None ,
2026-04-03 17:49:09 -03:00
{ " h " : 8 , " w " : 12 , " x " : 12 , " y " : 8 } ,
2026-04-12 22:53:23 -03:00
unit = " none " ,
2026-04-03 20:45:40 -03:00
targets = [
{ " refId " : " A " , " expr " : CLIMATE_TEMP_SERIES , " legendFormat " : " Temperature (°C) " } ,
2026-04-12 17:56:54 -03:00
{ " refId " : " B " , " expr " : CLIMATE_TEMP_FAHRENHEIT_SERIES , " legendFormat " : " Temperature (°F) " } ,
{ " refId " : " C " , " expr " : CLIMATE_HUMIDITY_SERIES , " legendFormat " : " Humidity ( % ) " } ,
{ " refId " : " D " , " expr " : CLIMATE_PRESSURE_SERIES , " legendFormat " : " Pressure (VPD kPa) " } ,
2026-04-03 20:45:40 -03:00
] ,
# The four series share one panel with unit "none"; each override
# matches a legend name from the targets above and sets a suffix
# unit, 2 decimals and a non-centred axis. Humidity and pressure
# are additionally moved to the right-hand axis.
field_overrides = [
2026-04-12 22:53:23 -03:00
{
" matcher " : { " id " : " byName " , " options " : " Temperature (°C) " } ,
" properties " : [
{ " id " : " unit " , " value " : " suffix:°C " } ,
{ " id " : " decimals " , " value " : 2 } ,
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
] ,
} ,
2026-04-12 17:56:54 -03:00
{
" matcher " : { " id " : " byName " , " options " : " Temperature (°F) " } ,
" properties " : [
2026-04-12 22:53:23 -03:00
{ " id " : " unit " , " value " : " suffix:°F " } ,
{ " id " : " decimals " , " value " : 2 } ,
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
2026-04-12 17:56:54 -03:00
] ,
} ,
2026-04-03 20:45:40 -03:00
{
2026-04-12 17:28:15 -03:00
" matcher " : { " id " : " byName " , " options " : " Humidity ( % ) " } ,
" properties " : [
2026-04-12 22:53:23 -03:00
{ " id " : " unit " , " value " : " suffix: % " } ,
{ " id " : " decimals " , " value " : 2 } ,
2026-04-12 17:56:54 -03:00
{ " id " : " custom.axisPlacement " , " value " : " right " } ,
2026-04-12 22:53:23 -03:00
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
2026-04-12 17:28:15 -03:00
] ,
} ,
{
2026-04-12 17:56:54 -03:00
" matcher " : { " id " : " byName " , " options " : " Pressure (VPD kPa) " } ,
2026-04-03 20:45:40 -03:00
" properties " : [
2026-04-12 17:56:54 -03:00
{ " id " : " unit " , " value " : " suffix:kPa " } ,
2026-04-03 20:45:40 -03:00
{ " id " : " custom.axisPlacement " , " value " : " right " } ,
{ " id " : " decimals " , " value " : 2 } ,
2026-04-12 22:53:23 -03:00
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
2026-04-03 20:45:40 -03:00
] ,
}
] ,
2026-04-03 14:55:16 -03:00
legend_display = " table " ,
legend_placement = " right " ,
2026-04-12 17:56:54 -03:00
description = " Historical tent temperature (C/F), humidity, and pressure proxy (VPD kPa). " ,
2026-04-03 14:55:16 -03:00
)
)
# --- Row y=16: current fan activity (id 5) and fan history (id 6) ---
panels . append (
2026-04-12 19:56:12 -03:00
stat_panel (
2026-04-03 20:45:40 -03:00
5 ,
" Fan Activity " ,
2026-04-12 19:46:39 -03:00
CLIMATE_FAN_CURRENT_ROW_EXPR ,
2026-04-03 17:49:09 -03:00
{ " h " : 8 , " w " : 12 , " x " : 0 , " y " : 16 } ,
2026-04-12 19:56:12 -03:00
unit = " none " ,
decimals = 0 ,
text_mode = " name_and_value " ,
legend = " {{ metric}} " ,
2026-04-12 19:46:39 -03:00
instant = True ,
2026-04-12 19:56:12 -03:00
# Fan level scale (description gives the range as 0-10): green
# base, yellow from 7, red from 9.
thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 7 } ,
{ " color " : " red " , " value " : 9 } ,
] ,
} ,
2026-04-12 19:46:39 -03:00
# NOTE(review): every override below only sets decimals to 0, the
# same value already passed as decimals= above - presumably
# redundant; confirm how stat_panel applies the decimals argument.
field_overrides = [
{ " matcher " : { " id " : " byName " , " options " : " Outlet " } , " properties " : [ { " id " : " decimals " , " value " : 0 } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " Inlet In " } , " properties " : [ { " id " : " decimals " , " value " : 0 } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " Inlet Out " } , " properties " : [ { " id " : " decimals " , " value " : 0 } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " Interior " } , " properties " : [ { " id " : " decimals " , " value " : 0 } ] } ,
] ,
2026-04-12 20:11:37 -03:00
orientation = " vertical " ,
2026-04-12 19:56:12 -03:00
wide_layout = False ,
description = " Current fan activity levels: outlet, inlet in, inlet out, interior (0-10). " ,
2026-04-03 14:55:16 -03:00
)
)
panels . append (
2026-04-03 17:49:09 -03:00
timeseries_panel (
2026-04-03 20:45:40 -03:00
6 ,
" Fan History (0-10) " ,
None ,
2026-04-03 17:49:09 -03:00
{ " h " : 8 , " w " : 12 , " x " : 12 , " y " : 16 } ,
unit = " none " ,
2026-04-03 20:45:40 -03:00
# max_value=10 matches the 0-10 scale named in the title.
max_value = 10 ,
targets = [
2026-04-09 20:10:52 -03:00
{ " refId " : " A " , " expr " : CLIMATE_FAN_OUTLET_SERIES , " legendFormat " : " Inside Outlet " } ,
{ " refId " : " B " , " expr " : CLIMATE_FAN_INSIDE_INLET_SERIES , " legendFormat " : " Inside Inlet " } ,
{ " refId " : " C " , " expr " : CLIMATE_FAN_OUTSIDE_INLET_SERIES , " legendFormat " : " Outside Inlet " } ,
{ " refId " : " D " , " expr " : CLIMATE_FAN_INTERIOR_SERIES , " legendFormat " : " Interior Fans " } ,
2026-04-03 20:45:40 -03:00
] ,
2026-04-03 17:49:09 -03:00
legend_display = " table " ,
legend_placement = " right " ,
2026-04-03 20:45:40 -03:00
description = " Historical fan activity for all four fan groups (0-10 scale). " ,
2026-04-03 14:55:16 -03:00
)
)
# Dashboard envelope: metadata plus the panel list built above.
return {
" uid " : " atlas-power " ,
" title " : " Atlas Power " ,
" folderUid " : PRIVATE_FOLDER ,
" editable " : True ,
" panels " : panels ,
" time " : { " from " : " now-24h " , " to " : " now " } ,
" annotations " : { " list " : [ ] } ,
" schemaVersion " : 39 ,
" style " : " dark " ,
" tags " : [ " atlas " , " power " , " climate " ] ,
}
2025-12-02 13:16:00 -03:00
def build_gpu_dashboard():
    """Assemble the "Atlas GPU" dashboard model.

    Four panels are laid out two per row (each layout dict is 12 wide and
    8 high):

    1. pie chart "Namespace GPU Share",
    2. time series "GPU Util by Namespace",
    3. time series "GPU Util by Node",
    4. table "Top Pods by GPU Util" (top 10 by summed DCGM GPU utilisation).

    The namespace panels are filtered through the ``namespace_scope_gpu``
    dashboard variable; the CPU and RAM scope variables are declared in the
    templating list as well.

    Returns:
        dict: dashboard definition with uid ``atlas-gpu``, stored in
        ``PRIVATE_FOLDER`` with a default time range of the last 12 hours.
    """
    scope_var = "namespace_scope_gpu"
    # Grafana variable reference ("$name") handed to the query builders.
    scope_ref = f"${scope_var}"

    # Both time series panels show their legend as a table on the right.
    table_legend = {"legend_display": "table", "legend_placement": "right"}

    panels = [
        pie_panel(
            1,
            "Namespace GPU Share",
            namespace_gpu_share_expr(scope_ref),
            {"h": 8, "w": 12, "x": 0, "y": 0},
            links=namespace_scope_links(scope_var),
            description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
        ),
        timeseries_panel(
            2,
            "GPU Util by Namespace",
            namespace_gpu_usage_instant(scope_ref),
            {"h": 8, "w": 12, "x": 12, "y": 0},
            unit="percent",
            legend="{{namespace}}",
            **table_legend,
        ),
        timeseries_panel(
            3,
            "GPU Util by Node",
            gpu_util_by_hostname(),
            {"h": 8, "w": 12, "x": 0, "y": 8},
            unit="percent",
            legend="{{Hostname}}",
            **table_legend,
        ),
        table_panel(
            4,
            "Top Pods by GPU Util",
            'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
            {"h": 8, "w": 12, "x": 12, "y": 8},
            unit="percent",
            transformations=[{"id": "labelsToFields", "options": {}}],
        ),
    ]

    # One scope variable per resource kind, in CPU / GPU / RAM order.
    scope_variables = [
        namespace_scope_variable(f"namespace_scope_{kind.lower()}", f"{kind} namespace filter")
        for kind in ("CPU", "GPU", "RAM")
    ]

    return {
        "uid": "atlas-gpu",
        "title": "Atlas GPU",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "gpu"],
        "templating": {"list": scope_variables},
    }
2025-11-17 14:22:46 -03:00
# Registry of every generated dashboard. Keys are the dashboard uids (also
# used as the JSON file stem); each entry holds the builder function that
# returns the dashboard dict and the ConfigMap manifest path it is rendered
# into. Iteration order is the order in which dashboards are built/rendered.
DASHBOARDS = {
    f"atlas-{slug}": {
        "builder": builder,
        "configmap": ROOT / "services" / "monitoring" / f"grafana-dashboard-{slug}.yaml",
    }
    for slug, builder in (
        ("overview", build_overview),
        ("pods", build_pods_dashboard),
        ("nodes", build_nodes_dashboard),
        ("storage", build_storage_dashboard),
        ("network", build_network_dashboard),
        ("mail", build_mail_dashboard),
        ("jobs", build_jobs_dashboard),
        ("testing", build_testing_dashboard),
        ("power", build_power_dashboard),
        ("gpu", build_gpu_dashboard),
    )
}
2025-11-17 16:27:38 -03:00
def write_json(uid, data):
    """Write a dashboard model to ``DASHBOARD_DIR/<uid>.json``.

    Args:
        uid: Dashboard uid, used as the file stem.
        data: JSON-serialisable dashboard dict.

    The dashboard directory (and any missing parents) is created first.
    The file is pretty-printed with a two-space indent and ends with a
    trailing newline.
    """
    DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
    target = DASHBOARD_DIR.joinpath(f"{uid}.json")
    serialized = json.dumps(data, indent=2)
    target.write_text(f"{serialized}\n")
2025-11-17 16:27:38 -03:00
def render_configmap(uid, info):
    """Render ``DASHBOARD_DIR/<uid>.json`` into its ConfigMap manifest.

    The JSON file is parsed and re-serialised (two-space indent), indented
    to sit under the ``<file>.json: |`` entry of ``CONFIG_TEMPLATE``, and
    written to ``info["configmap"]``. A one-line summary is printed.

    Args:
        uid: Dashboard uid; selects the JSON file to read.
        info: ``DASHBOARDS`` entry. Only ``info["configmap"]`` is used; it
            must be a path located under ``ROOT``.

    Raises:
        FileNotFoundError: The dashboard JSON does not exist yet.
        json.JSONDecodeError: The dashboard JSON is malformed.
        ValueError: The ConfigMap path is not located under ``ROOT``.
    """
    json_path = DASHBOARD_DIR / f"{uid}.json"

    # Decode explicitly as UTF-8 rather than with the locale default: the
    # JSON on disk may have been exported or edited by hand and contain raw
    # non-ASCII text, which must parse identically on every machine.
    dashboard = json.loads(json_path.read_text(encoding="utf-8"))

    # Re-serialising normalises formatting; json.dumps escapes non-ASCII
    # characters by default, so the payload itself is plain ASCII.
    payload = json.dumps(dashboard, indent=2)

    # NOTE(review): four-space indent assumed so the payload nests under the
    # two-space "<key>: |" line of CONFIG_TEMPLATE -- confirm.
    indented = "\n".join("    " + line for line in payload.splitlines())

    output_path = info["configmap"]
    relative_path = output_path.relative_to(ROOT)
    content = CONFIG_TEMPLATE.format(
        relative_path=relative_path,
        name=output_path.stem,
        key=json_path.name,
        payload=indented,
    )
    output_path.write_text(content, encoding="utf-8")
    print(f"Rendered {json_path.name} -> {relative_path}")
def main(argv=None):
    """Command-line entry point.

    With ``--build`` every builder in ``DASHBOARDS`` is executed and its
    result written as JSON first. Regardless of the flag, all ConfigMaps
    are then re-rendered from the JSON files on disk.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, so the script behaves as
            before; passing a list allows calling ``main`` programmatically.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--build",
        action="store_true",
        help="Regenerate dashboard JSON files from builders",
    )
    args = parser.parse_args(argv)

    # Build every JSON file before rendering anything, so a failing builder
    # aborts the run before any ConfigMap is rewritten.
    if args.build:
        for uid, info in DASHBOARDS.items():
            write_json(uid, info["builder"]())

    for uid, info in DASHBOARDS.items():
        render_configmap(uid, info)


if __name__ == "__main__":
    main()