2025-11-17 14:22:46 -03:00
#!/usr/bin/env python3
2025-11-17 16:27:38 -03:00
""" Generate Atlas Grafana dashboards and render them into ConfigMaps.
2025-11-17 14:22:46 -03:00
Usage :
2025-12-02 13:16:00 -03:00
    scripts/dashboards_render_atlas.py --build   # rebuild JSON + ConfigMaps
    scripts/dashboards_render_atlas.py           # re-render ConfigMaps from JSON
2025-11-17 14:22:46 -03:00
"""
2025-11-17 16:27:38 -03:00
2025-11-17 14:22:46 -03:00
import argparse
import json
import textwrap
2026-01-01 14:44:33 -03:00
import urllib . parse
2025-11-17 14:22:46 -03:00
from pathlib import Path
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# Paths, folders, and shared metadata
# ---------------------------------------------------------------------------
2025-11-17 14:22:46 -03:00
# Directory one level above the directory containing this file
# (parents[1] of the resolved script path).
ROOT = Path ( __file__ ) . resolve ( ) . parents [ 1 ]
# Dashboard directory, built by joining three path segments onto ROOT.
DASHBOARD_DIR = ROOT / " services " / " monitoring " / " dashboards "
# ConfigMap manifest template with {relative_path}, {name}, {key} and {payload}
# placeholders (presumably filled with str.format by code outside this view).
# textwrap.dedent removes whitespace common to every line of the literal.
CONFIG_TEMPLATE = textwrap . dedent (
""" # {relative_path}
apiVersion : v1
kind : ConfigMap
metadata :
name : { name }
labels :
grafana_dashboard : " 1 "
data :
{ key } : |
{ payload }
"""
)
# Datasource descriptor (type + uid) shared by the generated queries.
PROM_DS = {
    " type ": " prometheus ",
    " uid ": " atlas-vm ",
}

# Folder identifiers and the Astraios mount path used by filesystem queries.
PUBLIC_FOLDER = " overview "
PRIVATE_FOLDER = " atlas-internal "
ASTRAIOS_MOUNTPOINT = " /mnt/astraios "

# (color, lower bound) pairs in ascending order; the first step has no bound.
_PERCENT_STEPS = (
    (" green ", None),
    (" yellow ", 50),
    (" orange ", 75),
    (" red ", 91.5),
)
PERCENT_THRESHOLDS = {
    " mode ": " absolute ",
    " steps ": [
        {" color ": color, " value ": bound} for color, bound in _PERCENT_STEPS
    ],
}

# Range window interpolated into the namespace CPU rate() query.
NAMESPACE_CPU_WINDOW = " 1m "
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# Cluster metadata
# ---------------------------------------------------------------------------
# Control-plane members and the extra hosts the control plane depends on.
CONTROL_PLANE_NODES = [" titan-0a ", " titan-0b ", " titan-0c "]
CONTROL_DEPENDENCIES = [" titan-db ", " titan-jh "]
CONTROL_ALL = [*CONTROL_PLANE_NODES, *CONTROL_DEPENDENCIES]

# Worker hosts; the order is kept exactly as declared because later code
# enumerates this list.
WORKER_NODES = [
    " titan-04 ",
    " titan-05 ",
    " titan-06 ",
    " titan-07 ",
    " titan-08 ",
    " titan-09 ",
    " titan-10 ",
    " titan-11 ",
    " titan-20 ",
    " titan-21 ",
    " titan-12 ",
    " titan-13 ",
    " titan-14 ",
    " titan-15 ",
    " titan-16 ",
    " titan-17 ",
    " titan-18 ",
    " titan-19 ",
    " titan-22 ",
    " titan-24 ",
]

# Regex alternations over the node lists.
_NODE_SEPARATOR = " | "
CONTROL_REGEX = _NODE_SEPARATOR.join(CONTROL_PLANE_NODES)
CONTROL_ALL_REGEX = _NODE_SEPARATOR.join(CONTROL_ALL)
WORKER_REGEX = _NODE_SEPARATOR.join(WORKER_NODES)

# Node counts and the "/ N" suffix strings derived from them.
CONTROL_TOTAL, WORKER_TOTAL = len(CONTROL_PLANE_NODES), len(WORKER_NODES)
CONTROL_SUFFIX = " / " + str(CONTROL_TOTAL) + " "
WORKER_SUFFIX = " / " + str(WORKER_TOTAL) + " "
2026-01-11 23:46:24 -03:00
# Namespaces considered infrastructure (excluded from workload counts)
INFRA_PATTERNS = [
    " kube-.* ",
    " .*-system ",
    " traefik ",
    " monitoring ",
    " logging ",
    " cert-manager ",
    " maintenance ",
    " postgres ",
]
# Anchored alternation over every infrastructure pattern.
_INFRA_ALTERNATION = " | ".join(INFRA_PATTERNS)
INFRA_REGEX = " ^( " + _INFRA_ALTERNATION + " )$ "
# Namespaces allowed on control plane without counting as workloads
CP_ALLOWED_NS = INFRA_REGEX
# Node-name regex for Longhorn hosts, and a list of seven gauge widths.
LONGHORN_NODE_REGEX = " titan-1[2-9]|titan-2[24] "
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
2025-11-18 17:09:13 -03:00
# Sum of kube_pod_info on control-plane nodes outside the allowed
# namespaces; falls back to vector(0) when the sum returns nothing.
CONTROL_WORKLOADS_EXPR = (
    f' sum(kube_pod_info {{ node=~ " {CONTROL_REGEX} " '
    f',namespace!~ " {CP_ALLOWED_NS} " }} ) '
    "or on() vector(0) "
)
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# PromQL helpers
# ---------------------------------------------------------------------------
# node_uname_info with its nodename label copied into a node label.
NODE_INFO = (
    ' label_replace(node_uname_info { nodename!= " " }, '
    '" node " , " $1 " , " nodename " , " (.*) " ) '
)
def node_filter(regex):
    """Build a node_uname_info selector limited to nodenames matching ``regex``.

    The returned PromQL wraps the selector in label_replace() so each series
    also carries a ``node`` label copied from ``nodename``.
    """
    selector = f' label_replace(node_uname_info {{ nodename=~ " {regex} " }} , '
    relabel = ' " node " , " $1 " , " nodename " , " (.*) " ) '
    return selector + relabel
2025-11-17 14:22:46 -03:00
2025-11-17 16:27:38 -03:00
def scoped_node_expr(base, scope=""):
    """Average ``base`` per node, optionally restricted to a node regex.

    ``base`` is joined to NODE_INFO on ``instance`` to pick up the ``node``
    label. When ``scope`` is non-empty the result is additionally multiplied
    by ``node_filter(scope)`` on ``node``.
    """
    per_node = f" avg by (node) (( {base} ) * on(instance) group_left(node) {NODE_INFO} ) "
    if not scope:
        return per_node
    return f" ( {per_node} ) * on(node) group_left() {node_filter(scope)} "
def node_cpu_expr(scope=""):
    """Return the per-node CPU usage percentage expression (100 minus idle)."""
    idle_rate = ' avg by (instance) (rate(node_cpu_seconds_total { mode= " idle " }[5m])) '
    busy_percent = " (1 - " + idle_rate + " ) * 100 "
    return scoped_node_expr(busy_percent, scope)
def node_mem_expr(scope=""):
    """Return the per-node memory usage percentage expression."""
    pieces = (
        " avg by (instance) ( ",
        " (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) ",
        " / node_memory_MemTotal_bytes * 100) ",
    )
    return scoped_node_expr("".join(pieces), scope)
def filesystem_usage_expr(mount, scope=""):
    """Return the per-node used-space percentage for the filesystem at ``mount``.

    tmpfs and overlay filesystems are excluded by the label selector.
    """
    # Same label selector for the available-bytes and size series.
    labels = f'{{ mountpoint= " {mount} " ,fstype!~ " tmpfs|overlay " }}'
    used_percent = (
        " avg by (instance) ( "
        f" (1 - (node_filesystem_avail_bytes {labels} "
        f" / node_filesystem_size_bytes {labels} )) * 100) "
    )
    return scoped_node_expr(used_percent, scope)
def root_usage_expr(scope=""):
    """Return the filesystem usage expression for the root mountpoint."""
    root_mount = " / "
    return filesystem_usage_expr(root_mount, scope)
2026-04-11 11:54:43 -03:00
def astraios_usage_expr(scope=""):
    """Return the filesystem usage expression for ASTRAIOS_MOUNTPOINT."""
    mount = ASTRAIOS_MOUNTPOINT
    return filesystem_usage_expr(mount, scope)
2025-11-17 16:27:38 -03:00
def astreae_usage_expr(mount):
    """Return the used-space percentage summed over all series for ``mount``.

    Computed as 100 minus (total available / total size * 100), excluding
    tmpfs and overlay filesystems.
    """
    labels = f'{{ mountpoint= " {mount} " ,fstype!~ " tmpfs|overlay " }}'
    available = f"sum(node_filesystem_avail_bytes {labels} )"
    capacity = f"sum(node_filesystem_size_bytes {labels} )"
    return f" 100 - ({available} /  {capacity} * 100) "
def astreae_free_expr(mount):
    """Return the summed available bytes for ``mount`` (tmpfs/overlay excluded)."""
    labels = f'{{ mountpoint= " {mount} " ,fstype!~ " tmpfs|overlay " }}'
    return f" sum(node_filesystem_avail_bytes {labels} ) "
2025-11-17 20:19:20 -03:00
def topk_with_node(expr):
    """Wrap ``expr`` in topk(1, ...) and copy the ``node`` label into ``__name__``."""
    head = " label_replace(topk(1, "
    tail = ' ), " __name__ " , " $1 " , " node " , " (.*) " ) '
    return f"{head}{expr}{tail}"
2025-11-17 20:19:20 -03:00
2025-11-17 20:14:11 -03:00
def node_net_expr(scope=""):
    """Return per-node network throughput: receive plus transmit byte rates.

    The ``lo`` device is excluded by the label selector.
    """
    pieces = [
        " sum by (instance) ( ",
        ' rate(node_network_receive_bytes_total { device!~ " lo " }[5m]) ',
        ' + rate(node_network_transmit_bytes_total { device!~ " lo " }[5m])) ',
    ]
    return scoped_node_expr("".join(pieces), scope)
def node_io_expr(scope=""):
    """Return per-node disk throughput: read plus written byte rates."""
    reads = " sum by (instance) (rate(node_disk_read_bytes_total[5m]) "
    writes = " + rate(node_disk_written_bytes_total[5m])) "
    return scoped_node_expr(reads + writes, scope)
2026-01-01 14:44:33 -03:00
def namespace_selector(scope_var):
    """Return the container label matchers followed by ``scope_var``.

    The fixed part requires non-empty namespace, pod and container labels and
    excludes the ``POD`` container.
    """
    fixed = ' namespace!= " " ,pod!= " " ,container!= " " ,container!= " POD " , '
    return f"{fixed}{scope_var} "
2026-01-01 14:44:33 -03:00
def namespace_gpu_selector(scope_var):
    """Return matchers for non-empty namespace and pod, followed by ``scope_var``."""
    fixed = ' namespace!= " " ,pod!= " " , '
    return f"{fixed}{scope_var} "
def namespace_cpu_raw(scope_var):
    """Return the per-namespace CPU usage rate over NAMESPACE_CPU_WINDOW."""
    matchers = namespace_selector(scope_var)
    metric = " sum(rate(container_cpu_usage_seconds_total "
    ranged = f" {{ {matchers} }} [ {NAMESPACE_CPU_WINDOW} ])) by (namespace) "
    return metric + ranged
2026-01-01 14:44:33 -03:00
def namespace_ram_raw(scope_var):
    """Return the per-namespace sum of container working-set memory bytes."""
    matchers = namespace_selector(scope_var)
    return f" sum(container_memory_working_set_bytes {{ {matchers} }} ) by (namespace) "
def namespace_gpu_usage_instant(scope_var):
    """Return the per-namespace GPU usage expression (delegates to gpu_usage_by_namespace)."""
    usage = gpu_usage_by_namespace(scope_var)
    return usage
2026-01-26 22:26:24 -03:00
def jetson_gpu_util_by_node():
    """Return the per-node maximum of jetson_gr3d_freq_percent for labelled nodes."""
    metric = 'jetson_gr3d_freq_percent { node!= " " }'
    return " max by (node) (" + metric + ") "
2026-01-27 21:43:37 -03:00
def dcgm_gpu_util_by_node():
    """Return DCGM GPU utilisation averaged per node.

    DCGM_FI_DEV_GPU_UTIL is relabelled so ``Hostname`` becomes ``pod`` and a
    fixed ``namespace`` label is added, then joined to kube_pod_info to pick
    up the ``node`` label.
    """
    with_pod = ' label_replace(DCGM_FI_DEV_GPU_UTIL, " pod " , " $1 " , " Hostname " , " (.*) " ) '
    with_namespace = f' label_replace( {with_pod} , " namespace " , " monitoring " , " " , " " ) '
    pieces = [
        " avg by (node) ( ",
        f" {with_namespace} * on(namespace,pod) group_left(node) ",
        ' kube_pod_info { namespace= " monitoring " } ',
        " ) ",
    ]
    return "".join(pieces)
2026-01-27 21:43:37 -03:00
def gpu_util_by_node():
    """Return per-node GPU utilisation: the DCGM series, else the Jetson series."""
    dcgm = dcgm_gpu_util_by_node()
    jetson = jetson_gpu_util_by_node()
    return f" {dcgm} or {jetson} "
def gpu_util_by_hostname():
    """Return per-node GPU utilisation with ``node`` copied into ``Hostname``."""
    per_node = gpu_util_by_node()
    return f' label_replace( {per_node} , " Hostname " , " $1 " , " node " , " (.*) " ) '
def gpu_node_labels():
    """Return kube_node_labels series for accelerator- or jetson-labelled nodes."""
    accelerator = ' kube_node_labels { label_accelerator=~ " .+ " } '
    jetson = ' kube_node_labels { label_jetson= " true " } '
    return accelerator + "or" + jetson
def gpu_requests_by_namespace_node(scope_var):
    """Return GPU resource requests summed by (namespace, node).

    Requests are joined to kube_pod_info for the ``node`` label and then to
    gpu_node_labels() on ``node``.
    """
    pieces = [
        " sum by (namespace,node) ( ",
        f' kube_pod_container_resource_requests {{ resource=~ " nvidia.com/gpu.* " , {scope_var} }} ',
        " * on(namespace,pod) group_left(node) kube_pod_info ",
        f" * on(node) group_left() ( {gpu_node_labels()} ) ",
        " ) ",
    ]
    return "".join(pieces)
def gpu_usage_by_namespace(scope_var):
    """Return per-namespace GPU usage.

    Each namespace's share of a node's GPU requests (denominator clamped to
    at least 1) is multiplied by that node's utilisation, then summed per
    namespace.
    """
    per_namespace_node = gpu_requests_by_namespace_node(scope_var)
    per_node = f" sum by (node) ( {per_namespace_node} ) "
    request_share = f" ( {per_namespace_node} ) / clamp_min( {per_node} , 1) "
    utilisation = f" * on(node) group_left() ( {gpu_util_by_node()} ) "
    return "".join((" sum by (namespace) ( ", request_share, utilisation, " ) "))
def jetson_gpu_usage_by_namespace(scope_var):
    """Return per-namespace GPU usage weighted by Jetson utilisation only.

    NOTE(review): ``jetson_gpu_requests`` is not defined in the part of the
    file visible here; confirm it exists before relying on this function.
    """
    per_namespace_node = jetson_gpu_requests(scope_var)
    per_node = f" sum by (node) ( {per_namespace_node} ) "
    request_share = f" ( {per_namespace_node} ) / clamp_min( {per_node} , 1) "
    utilisation = f" * on(node) group_left() {jetson_gpu_util_by_node()} "
    return "".join((" sum by (namespace) ( ", request_share, utilisation, " ) "))
2026-01-01 14:44:33 -03:00
2025-11-18 14:08:33 -03:00
def namespace_share_expr(resource_expr):
    """Return ``resource_expr`` as a percentage of its own sum.

    The denominator is clamped to at least 1.
    """
    denominator = f" clamp_min(sum( {resource_expr} ), 1) "
    return f" 100 * ( {resource_expr} ) / {denominator} "
2025-11-17 21:57:40 -03:00
2026-01-01 14:44:33 -03:00
def namespace_cpu_share_expr(scope_var):
    """Return each namespace's CPU usage as a percentage of the total."""
    cpu_usage = namespace_cpu_raw(scope_var)
    return namespace_share_expr(cpu_usage)
2025-11-18 14:08:33 -03:00
2026-01-01 14:44:33 -03:00
def namespace_ram_share_expr(scope_var):
    """Return each namespace's working-set memory as a percentage of the total."""
    ram_usage = namespace_ram_raw(scope_var)
    return namespace_share_expr(ram_usage)
2025-11-18 00:11:39 -03:00
2026-01-01 14:44:33 -03:00
def namespace_gpu_share_expr(scope_var):
    """Return each namespace's GPU usage share, or an ``idle`` series at 100.

    The ``idle`` branch is a vector(100) labelled namespace="idle",
    multiplied by the scalar of (total == bool 0), so it is non-zero only
    when the total usage is zero.
    """
    usage = namespace_gpu_usage_instant(scope_var)
    total = f" (sum( {usage} ) or on() vector(0)) "
    share = f" 100 * ( {usage} ) / clamp_min( {total} , 1) "
    idle = (
        f' label_replace(vector(100), " namespace " , " idle " , " " , " " ) * scalar( {total}'
        " == bool 0) "
    )
    return f" ( {share} ) or ( {idle} ) "
2025-11-17 23:12:16 -03:00
2025-12-12 20:30:00 -03:00
# Count of pods whose phase is neither Running nor Succeeded.
PROBLEM_PODS_EXPR = "".join(
    (
        ' sum(max by (namespace,pod) (kube_pod_status_phase { phase!~ " Running|Succeeded " })) ',
        " or on() vector(0) ",
    )
)
# Count of pods with a container waiting on CrashLoopBackOff/ImagePullBackOff.
CRASHLOOP_EXPR = "".join(
    (
        " sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason ",
        ' { reason=~ " CrashLoopBackOff|ImagePullBackOff " })) ',
        " or on() vector(0) ",
    )
)
# Count of pods whose deletion timestamp is more than 600 seconds old.
STUCK_TERMINATING_EXPR = "".join(
    (
        " sum(max by (namespace,pod) ( ",
        ' ((time() - kube_pod_deletion_timestamp { pod!= " " }) > bool 600) ',
        ' and on(namespace,pod) (kube_pod_deletion_timestamp { pod!= " " } > bool 0) ',
        " )) ",
        " or on() vector(0) ",
    )
)
2025-12-19 13:46:34 -03:00
UPTIME_WINDOW = " 365d "
# Keep the subquery step coarse so we don't request an excessive number of points.
UPTIME_STEP = " 1h "

# Fraction of requested traefik replicas that are available (denominator >= 1).
_TRAEFIK_SELECTOR = '{ namespace=~ " traefik|kube-system " ,deployment= " traefik " }'
TRAEFIK_READY_EXPR = "".join(
    (
        " ( ",
        f" sum(kube_deployment_status_replicas_available {_TRAEFIK_SELECTOR}) ",
        " / clamp_min( ",
        f" sum(kube_deployment_spec_replicas {_TRAEFIK_SELECTOR}), 1) ",
        " ) ",
    )
)
# Fraction of control-plane nodes reporting Ready=true.
CONTROL_READY_FRACTION_EXPR = (
    f' (sum(kube_node_status_condition {{ condition= " Ready " ,status= " true " ,node=~ " {CONTROL_REGEX} " }} ) '
    + " / "
    + str(CONTROL_TOTAL)
    + " ) "
)
# Availability is the lower of the two readiness fractions.
UPTIME_AVAIL_EXPR = " min(( " + CONTROL_READY_FRACTION_EXPR + " ), ( " + TRAEFIK_READY_EXPR + " )) "

# Tie-breaker to deterministically pick one node per namespace when shares tie.
NODE_TIEBREAKER = " + ".join(
    [
        f" ( {node_filter(name)} ) * 1e-6 * {rank} "
        for rank, name in enumerate([*CONTROL_ALL, *WORKER_NODES], start=1)
    ]
)

# Availability averaged over the window via a subquery, and the same value
# expressed as a number of nines (input clamped just below 1).
UPTIME_AVG_EXPR = (
    " avg_over_time(( " + UPTIME_AVAIL_EXPR + " )[ " + UPTIME_WINDOW + " : " + UPTIME_STEP + " ]) "
)
UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
UPTIME_NINES_EXPR = " -log10(1 - clamp_max( " + UPTIME_AVG_EXPR + " , 0.999999999)) "
2025-12-12 15:23:51 -03:00
# Threshold steps for the uptime "nines" value: (color, lower bound) pairs.
_UPTIME_NINES_STEPS = (
    (" red ", None),
    (" orange ", 2),
    (" yellow ", 3),
    (" green ", 3.5),
)
UPTIME_THRESHOLDS = {
    " mode ": " absolute ",
    " steps ": [
        {" color ": color, " value ": bound} for color, bound in _UPTIME_NINES_STEPS
    ],
}

# Threshold steps for the uptime fraction (0..1 scale).
_UPTIME_FRACTION_STEPS = (
    (" red ", None),
    (" orange ", 0.99),
    (" yellow ", 0.999),
    (" green ", 0.9999),
    (" blue ", 0.99999),
)
UPTIME_PERCENT_THRESHOLDS = {
    " mode ": " absolute ",
    " steps ": [
        {" color ": color, " value ": bound} for color, bound in _UPTIME_FRACTION_STEPS
    ],
}
2025-11-17 16:27:38 -03:00
# Pod age in seconds joined to kube_pod_info for the node label; shared by
# the problem and crash-loop tables.
_POD_AGE_WITH_NODE = (
    ' (time() - kube_pod_created { pod!= " " }) '
    + " * on(namespace,pod) group_left(node) kube_pod_info "
)
# Pods not Running/Succeeded, with their phase attached.
PROBLEM_TABLE_EXPR = (
    _POD_AGE_WITH_NODE
    + " * on(namespace,pod) group_left(phase) "
    + ' max by (namespace,pod,phase) (kube_pod_status_phase { phase!~ " Running|Succeeded " }) '
)
# Containers waiting on CrashLoopBackOff/ImagePullBackOff, with the reason attached.
CRASHLOOP_TABLE_EXPR = (
    _POD_AGE_WITH_NODE
    + " * on(namespace,pod,container) group_left(reason) "
    + " max by (namespace,pod,container,reason) "
    + ' (kube_pod_container_status_waiting_reason { reason=~ " CrashLoopBackOff|ImagePullBackOff " }) '
)
# Seconds since the deletion timestamp for pods that have one, with the node attached.
STUCK_TABLE_EXPR = "".join(
    [
        " ( ",
        ' ((time() - kube_pod_deletion_timestamp { pod!= " " }) ',
        ' and on(namespace,pod) (kube_pod_deletion_timestamp { pod!= " " } > bool 0)) ',
        " * on(namespace,pod) group_left(node) kube_pod_info ",
        " ) ",
    ]
)
2026-01-11 23:46:24 -03:00
# Namespace label matchers: everything except infrastructure, everything,
# and infrastructure only.
NAMESPACE_SCOPE_WORKLOAD = ' namespace!~ " ' + INFRA_REGEX + ' " '
NAMESPACE_SCOPE_ALL = ' namespace=~ " .* " '
NAMESPACE_SCOPE_INFRA = ' namespace=~ " ' + INFRA_REGEX + ' " '
# Names of the per-resource scope variables (CPU, GPU, RAM).
NAMESPACE_SCOPE_VARS = [
    " namespace_scope_cpu ",
    " namespace_scope_gpu ",
    " namespace_scope_ram ",
]
2026-04-19 14:18:41 -03:00
def promql_task_regex(tasks):
    """Return a PromQL-safe regex alternation for the provided task names.

    Each name is treated as a literal: regex metacharacters (for example the
    ``.`` in ``schedule.foo``) are escaped so they match only themselves.
    The escaping backslash is doubled because the pattern is embedded in a
    double-quoted PromQL string, where a single backslash starts a string
    escape. Names without metacharacters are returned unchanged.

    Args:
        tasks: Iterable of task-name strings.

    Returns:
        The escaped names joined by the alternation separator; an empty
        string when ``tasks`` is empty.
    """
    import re  # local import keeps this helper self-contained

    def _escape(task):
        # "\\\\" is two backslashes: one survives PromQL string unescaping.
        return re.sub(r"[\\.+*?()|\[\]{}^$]", lambda match: "\\\\" + match.group(0), task)

    return " | ".join(_escape(task) for task in tasks)
# Every Ariadne schedule task name, in declaration order.
ARIADNE_ALL_SCHEDULE_TASKS = [
    " schedule.mailu_sync ",
    " schedule.nextcloud_sync ",
    " schedule.nextcloud_cron ",
    " schedule.nextcloud_maintenance ",
    " schedule.vaultwarden_sync ",
    " schedule.wger_user_sync ",
    " schedule.wger_admin ",
    " schedule.firefly_user_sync ",
    " schedule.firefly_cron ",
    " schedule.vault_k8s_auth ",
    " schedule.vault_oidc ",
    " schedule.comms_guest_name ",
    " schedule.comms_pin_invite ",
    " schedule.comms_reset_room ",
    " schedule.comms_seed_room ",
    " schedule.pod_cleaner ",
    " schedule.opensearch_prune ",
    " schedule.image_sweeper ",
    " schedule.metis_k3s_token_sync ",
    " schedule.platform_quality_suite_probe ",
]

# The "fast" subset: every task except the two listed here.
_NOT_FAST_TASKS = frozenset((" schedule.comms_pin_invite ", " schedule.comms_reset_room "))
ARIADNE_FAST_SCHEDULE_TASKS = list(
    filter(lambda name: name not in _NOT_FAST_TASKS, ARIADNE_ALL_SCHEDULE_TASKS)
)

# Tasks included in the schedule health expressions.
ARIADNE_SCHEDULE_HEALTH_TASKS = [
    " schedule.nextcloud_sync ",
    " schedule.nextcloud_cron ",
    " schedule.vaultwarden_sync ",
    " schedule.wger_user_sync ",
    " schedule.firefly_user_sync ",
    " schedule.comms_guest_name ",
    " schedule.comms_seed_room ",
    " schedule.pod_cleaner ",
    " schedule.image_sweeper ",
    " schedule.metis_k3s_token_sync ",
    " schedule.platform_quality_suite_probe ",
]
def _ariadne_task_filter(tasks):
    """Return the anchored ``task=~`` matcher for the given task names."""
    return f' task=~ " ^( {promql_task_regex(tasks)} )$ " '


def _ariadne_schedule_series(metric, label_filter):
    """Return ``metric`` selected by ``label_filter``."""
    return f" {metric} {{ {label_filter} }} "


# Task matchers for the all / fast / health task lists.
ARIADNE_ALL_SCHEDULE_FILTER = _ariadne_task_filter(ARIADNE_ALL_SCHEDULE_TASKS)
ARIADNE_FAST_SCHEDULE_FILTER = _ariadne_task_filter(ARIADNE_FAST_SCHEDULE_TASKS)
ARIADNE_SCHEDULE_HEALTH_FILTER = _ariadne_task_filter(ARIADNE_SCHEDULE_HEALTH_TASKS)

# Schedule series over all tasks.
ARIADNE_ALL_SCHEDULE_NEXT_RUN = _ariadne_schedule_series(
    "ariadne_schedule_next_run_timestamp_seconds", ARIADNE_ALL_SCHEDULE_FILTER
)
ARIADNE_ALL_SCHEDULE_LAST_SUCCESS = _ariadne_schedule_series(
    "ariadne_schedule_last_success_timestamp_seconds", ARIADNE_ALL_SCHEDULE_FILTER
)
ARIADNE_ALL_SCHEDULE_LAST_ERROR = _ariadne_schedule_series(
    "ariadne_schedule_last_error_timestamp_seconds", ARIADNE_ALL_SCHEDULE_FILTER
)
ARIADNE_ALL_SCHEDULE_LAST_STATUS = _ariadne_schedule_series(
    "ariadne_schedule_last_status", ARIADNE_ALL_SCHEDULE_FILTER
)

# Schedule series over the fast subset.
ARIADNE_FAST_SCHEDULE_LAST_SUCCESS = _ariadne_schedule_series(
    "ariadne_schedule_last_success_timestamp_seconds", ARIADNE_FAST_SCHEDULE_FILTER
)
ARIADNE_FAST_SCHEDULE_LAST_ERROR = _ariadne_schedule_series(
    "ariadne_schedule_last_error_timestamp_seconds", ARIADNE_FAST_SCHEDULE_FILTER
)
ARIADNE_FAST_SCHEDULE_LAST_STATUS = _ariadne_schedule_series(
    "ariadne_schedule_last_status", ARIADNE_FAST_SCHEDULE_FILTER
)

# Schedule series over the health subset.
ARIADNE_HEALTH_SCHEDULE_LAST_SUCCESS = _ariadne_schedule_series(
    "ariadne_schedule_last_success_timestamp_seconds", ARIADNE_SCHEDULE_HEALTH_FILTER
)
ARIADNE_HEALTH_SCHEDULE_LAST_STATUS = _ariadne_schedule_series(
    "ariadne_schedule_last_status", ARIADNE_SCHEDULE_HEALTH_FILTER
)
# Seconds since the last success (health tasks) and last error (all tasks),
# plus the same ages in hours.
ARIADNE_SCHEDULE_LAST_SUCCESS_AGE = f" (time() - {ARIADNE_HEALTH_SCHEDULE_LAST_SUCCESS} ) "
ARIADNE_SCHEDULE_LAST_ERROR_AGE = f" (time() - {ARIADNE_ALL_SCHEDULE_LAST_ERROR} ) "
ARIADNE_SCHEDULE_LAST_SUCCESS_AGE_HOURS = f" ( {ARIADNE_SCHEDULE_LAST_SUCCESS_AGE} ) / 3600 "
ARIADNE_SCHEDULE_LAST_ERROR_AGE_HOURS = f" ( {ARIADNE_SCHEDULE_LAST_ERROR_AGE} ) / 3600 "

# A health task is stale when its last success is older than this many seconds.
ARIADNE_SCHEDULE_STALE_WINDOW_SEC = 36 * 3600
ARIADNE_SCHEDULE_STALE = f" (( {ARIADNE_SCHEDULE_LAST_SUCCESS_AGE} ) > bool {ARIADNE_SCHEDULE_STALE_WINDOW_SEC} ) "

# Health tasks that have a next-run series but no last-success series.
# Both sides of `unless` use the health filter: pairing the all-task
# next-run series with the health-only last-success series would report
# every task outside the health list as missing.
ARIADNE_SCHEDULE_MISSING = (
    f" ( ariadne_schedule_next_run_timestamp_seconds {{ {ARIADNE_SCHEDULE_HEALTH_FILTER} }} "
    f" unless on(task) {ARIADNE_HEALTH_SCHEDULE_LAST_SUCCESS} ) "
)
# A health task has failed when (1 - last_status) is greater than 0.
ARIADNE_SCHEDULE_FAILED = f" ((1 - {ARIADNE_HEALTH_SCHEDULE_LAST_STATUS} ) > bool 0) "

# Counts for the three conditions, each falling back to vector(0).
ARIADNE_SCHEDULE_STALE_COUNT = f" sum( {ARIADNE_SCHEDULE_STALE} ) or on() vector(0) "
ARIADNE_SCHEDULE_MISSING_COUNT = f" count( {ARIADNE_SCHEDULE_MISSING} ) or on() vector(0) "
ARIADNE_SCHEDULE_FAILED_COUNT = f" sum( {ARIADNE_SCHEDULE_FAILED} ) or on() vector(0) "
2026-01-21 14:30:55 -03:00
# Selector for task runs that ended in error.
_ARIADNE_ERROR_RUNS = 'ariadne_task_runs_total { status= " error " }'


def _ariadne_runs_by_task(selector, window):
    """Return the increase of ``selector`` over ``window``, summed per task."""
    return f" sum by (task) (increase({selector}[{window}])) "


def _ariadne_runs_total(selector, window):
    """Return the increase of ``selector`` over ``window``, summed overall."""
    return f" sum(increase({selector}[{window}])) "


# Per-task error counts over several windows, and per-task successes over 24h.
ARIADNE_TASK_ERRORS_RANGE = _ariadne_runs_by_task(_ARIADNE_ERROR_RUNS, "$__range")
ARIADNE_TASK_ERRORS_24H = _ariadne_runs_by_task(_ARIADNE_ERROR_RUNS, "24h")
ARIADNE_TASK_ERRORS_1H = _ariadne_runs_by_task(_ARIADNE_ERROR_RUNS, "1h")
ARIADNE_TASK_ERRORS_30D = _ariadne_runs_by_task(_ARIADNE_ERROR_RUNS, "30d")
ARIADNE_TASK_SUCCESS_24H = _ariadne_runs_by_task('ariadne_task_runs_total { status= " ok " }', "24h")

# Overall run counts: by status, errors only, and all runs.
ARIADNE_TASK_RUNS_BY_STATUS_1H = " sum by (status) (increase(ariadne_task_runs_total[1h])) "
ARIADNE_TASK_ERRORS_1H_TOTAL = _ariadne_runs_total(_ARIADNE_ERROR_RUNS, "1h")
ARIADNE_TASK_ERRORS_24H_TOTAL = _ariadne_runs_total(_ARIADNE_ERROR_RUNS, "24h")
ARIADNE_TASK_RUNS_1H_TOTAL = _ariadne_runs_total("ariadne_task_runs_total", "1h")

# Per-interval series for attempts, failures, and runs that are neither ok nor error.
ARIADNE_TASK_ATTEMPTS_SERIES = _ariadne_runs_total("ariadne_task_runs_total", "$__interval")
ARIADNE_TASK_FAILURES_SERIES = _ariadne_runs_total(_ARIADNE_ERROR_RUNS, "$__interval")
ARIADNE_TASK_WARNINGS_SERIES = (
    _ariadne_runs_total('ariadne_task_runs_total { status!~ " ok|error " }', "$__interval")
    + "or on() vector(0) "
)
2026-04-19 14:18:41 -03:00
# Hours since the last success / last error across all schedule tasks.
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = f" (time() - {ARIADNE_ALL_SCHEDULE_LAST_SUCCESS} ) / 3600 "
ARIADNE_SCHEDULE_LAST_ERROR_HOURS = f" (time() - {ARIADNE_ALL_SCHEDULE_LAST_ERROR} ) / 3600 "

# Same ages, using the newest timestamp seen within the dashboard range.
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
    f" (time() - max_over_time( {ARIADNE_ALL_SCHEDULE_LAST_SUCCESS} [$__range])) / 3600 "
)
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
    f" (time() - max_over_time( {ARIADNE_ALL_SCHEDULE_LAST_ERROR} [$__range])) / 3600 "
)
ARIADNE_FAST_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
    f" (time() - max_over_time( {ARIADNE_FAST_SCHEDULE_LAST_SUCCESS} [$__range])) / 3600 "
)

# Hours until the next run of each fast-schedule task. The selector uses the
# FAST filter, like the other FAST_* expressions; previously it was built
# from the all-task next-run series and so included the excluded tasks.
ARIADNE_FAST_SCHEDULE_NEXT_RUN_HOURS = (
    f" (( ariadne_schedule_next_run_timestamp_seconds {{ {ARIADNE_FAST_SCHEDULE_FILTER} }} "
    " - time()) / 3600) "
)
2026-01-19 16:58:02 -03:00
ARIADNE_ACCESS_REQUESTS = " ariadne_access_requests_total "

# Canonical suite names, in display order.
PLATFORM_TEST_SUITE_NAMES = [
    " ariadne ",
    " metis ",
    " ananke ",
    " atlasbot ",
    " pegasus ",
    " soteria ",
    " titan_iac ",
    " bstein_home ",
    " data_prepper ",
]

# Status values counted as success, and the exported-job matcher.
PLATFORM_TEST_SUCCESS_STATUS = " ok|passed|success "
PLATFORM_TEST_CI_JOB = " platform-quality-ci "
PLATFORM_TEST_EXPORT_FILTER = ' exported_job= " ' + PLATFORM_TEST_CI_JOB + ' " '

# Regex of accepted ``suite`` label values per canonical suite name.
PLATFORM_TEST_SUITE_VALUE_BY_NAME = {
    " ariadne ": " ariadne ",
    " metis ": " metis ",
    " ananke ": " ananke ",
    " atlasbot ": " atlasbot ",
    " pegasus ": " pegasus|pegasus-health|pegasus_health ",
    " soteria ": " soteria ",
    " titan_iac ": " titan_iac|titan-iac ",
    " bstein_home ": " bstein_home|bstein-home ",
    " data_prepper ": " data_prepper|data-prepper ",
}

# Jenkins job name per canonical suite name, and the default Jenkins UI base URL.
PLATFORM_TEST_JENKINS_JOB_BY_SUITE = {
    " ariadne ": " ariadne ",
    " metis ": " metis ",
    " ananke ": " ananke ",
    " atlasbot ": " atlasbot ",
    " pegasus ": " pegasus ",
    " soteria ": " Soteria ",
    " titan_iac ": " titan-iac ",
    " bstein_home ": " bstein-dev-home ",
    " data_prepper ": " data-prepper ",
}
JENKINS_UI_BASE_DEFAULT = " https://ci.bstein.dev "

# Alternation over the accepted label values; a suite without an entry in the
# mapping falls back to its own name.
_SUITE_LABEL_VALUES = [
    PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(name, name) for name in PLATFORM_TEST_SUITE_NAMES
]
PLATFORM_TEST_SUITE_MATCHER = " | ".join(_SUITE_LABEL_VALUES)
PLATFORM_TEST_SUITE_CANONICAL_MATCHER = " | ".join(PLATFORM_TEST_SUITE_NAMES)
PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER = PLATFORM_TEST_SUITE_MATCHER
2026-04-04 01:33:15 -03:00
# Selectors over the gate-run counter: all runs, successful runs, and
# non-successful runs. Each ends with the space that precedes the range.
_GATE_SUITE_CLAUSE = f'platform_quality_gate_runs_total {{ suite=~ " {PLATFORM_TEST_SUITE_MATCHER} " '
_GATE_EXPORT_CLAUSE = f", {PLATFORM_TEST_EXPORT_FILTER} }} "
_GATE_RUNS_ALL = _GATE_SUITE_CLAUSE + _GATE_EXPORT_CLAUSE
_GATE_RUNS_OK = (
    _GATE_SUITE_CLAUSE + f',status=~ " {PLATFORM_TEST_SUCCESS_STATUS} " ' + _GATE_EXPORT_CLAUSE
)
_GATE_RUNS_NOT_OK = (
    _GATE_SUITE_CLAUSE + f',status!~ " {PLATFORM_TEST_SUCCESS_STATUS} " ' + _GATE_EXPORT_CLAUSE
)


def _gate_events(selector, window):
    """Return the summed increase of ``selector`` over ``window``, or vector(0)."""
    return f" (sum(increase({selector}[{window}])) or on() vector(0)) "


def _gate_success_percent(successes, total):
    """Return 100 * successes / total, with the denominator clamped to at least 1."""
    return f" 100 * ( {successes} ) / clamp_min(( {total} ), 1) "


# Successful and total run counts over 30d / 7d / 24h.
PLATFORM_TEST_SUCCESS_EVENTS_30D = _gate_events(_GATE_RUNS_OK, "30d")
PLATFORM_TEST_TOTAL_EVENTS_30D = _gate_events(_GATE_RUNS_ALL, "30d")
PLATFORM_TEST_SUCCESS_EVENTS_7D = _gate_events(_GATE_RUNS_OK, "7d")
PLATFORM_TEST_TOTAL_EVENTS_7D = _gate_events(_GATE_RUNS_ALL, "7d")
PLATFORM_TEST_SUCCESS_EVENTS_24H = _gate_events(_GATE_RUNS_OK, "24h")
PLATFORM_TEST_TOTAL_EVENTS_24H = _gate_events(_GATE_RUNS_ALL, "24h")

# Success percentages for the same windows.
TEST_SUCCESS_RATE = _gate_success_percent(
    PLATFORM_TEST_SUCCESS_EVENTS_30D, PLATFORM_TEST_TOTAL_EVENTS_30D
)
TEST_SUCCESS_RATE_7D = _gate_success_percent(
    PLATFORM_TEST_SUCCESS_EVENTS_7D, PLATFORM_TEST_TOTAL_EVENTS_7D
)
TEST_SUCCESS_RATE_24H = _gate_success_percent(
    PLATFORM_TEST_SUCCESS_EVENTS_24H, PLATFORM_TEST_TOTAL_EVENTS_24H
)

# Non-successful runs in the last 24h: overall and per suite (descending).
TEST_FAILURES_24H_TOTAL = _gate_events(_GATE_RUNS_NOT_OK, "24h")
PLATFORM_TEST_FAILURES_24H_BY_SUITE = (
    f" sort_desc(sum by (suite) (increase({_GATE_RUNS_NOT_OK}[24h]))) "
)

# Run activity per (suite, status) over 30d; 24h totals; suites with any run in 24h.
PLATFORM_TEST_ACTIVITY_30D = f" sum by (suite, status) (increase({_GATE_RUNS_ALL}[30d])) "
PLATFORM_TEST_RUNS_24H_TOTAL = PLATFORM_TEST_TOTAL_EVENTS_24H
PLATFORM_TEST_ACTIVE_SUITES_24H = (
    f" sum((sum by (suite) (increase({_GATE_RUNS_ALL}[24h])) > 0)) " + " or on() vector(0) "
)
2026-04-09 19:27:48 -03:00
# Range window used for each plotted success-rate point.
PLATFORM_TEST_POINT_WINDOW = " 1h "


def _suite_success_rate_target(index, suite):
    """Build the query target for one suite's success-rate series.

    Args:
        index: Zero-based position of the suite; mapped to refId "A", "B", ...
        suite: Canonical suite name, also used as the legend.

    Returns:
        A dict with the refId, the PromQL expression and the legend format.
    """
    matcher = PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite)
    window = PLATFORM_TEST_POINT_WINDOW
    successes = (
        f' (100 * (sum(increase(platform_quality_gate_runs_total {{ suite=~ " {matcher} " '
        f',status=~ " {PLATFORM_TEST_SUCCESS_STATUS} " , {PLATFORM_TEST_EXPORT_FILTER} }} '
        f" [ {window} ]))) / "
    )
    total = (
        f' clamp_min((sum(increase(platform_quality_gate_runs_total {{ suite=~ " {matcher} " '
        f", {PLATFORM_TEST_EXPORT_FILTER} }} [ {window} ]))), 1)) "
    )
    return {
        # ord() needs exactly one character; the padded three-character
        # literal raised TypeError when this module was imported.
        " refId ": chr(ord("A") + index),
        " expr ": successes + total,
        " legendFormat ": suite,
    }


# One target per suite, in PLATFORM_TEST_SUITE_NAMES order.
PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS = [
    _suite_success_rate_target(index, suite)
    for index, suite in enumerate(PLATFORM_TEST_SUITE_NAMES)
]
2026-04-09 20:16:44 -03:00
# Building blocks for the per-suite gate-run selectors used below.
_QG_SUITE_CLAUSE = f'platform_quality_gate_runs_total {{ suite=~ " {PLATFORM_TEST_SUITE_MATCHER} " '
_QG_EXPORT_CLAUSE = f", {PLATFORM_TEST_EXPORT_FILTER} }} "
_QG_OK_24H_BY_SUITE = (
    "(sum by (suite) (increase("
    + _QG_SUITE_CLAUSE
    + f',status=~ " {PLATFORM_TEST_SUCCESS_STATUS} " '
    + _QG_EXPORT_CLAUSE
    + "[24h])))"
)
_QG_ALL_24H_BY_SUITE = (
    "(sum by (suite) (increase(" + _QG_SUITE_CLAUSE + _QG_EXPORT_CLAUSE + "[24h])))"
)

# Per-suite success percentage over 24h, sorted descending.
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
    f" sort_desc(100 * {_QG_OK_24H_BY_SUITE} "
    f" / clamp_min({_QG_ALL_24H_BY_SUITE}, 1)) "
)

# Per-suite run count over 30d; used below to enumerate known suites.
QUALITY_GATE_SUITE_INDEX_30D = (
    " sum by (suite) (increase(" + _QG_SUITE_CLAUSE + _QG_EXPORT_CLAUSE + "[30d])) "
)


def _quality_gate_with_missing(expr):
    """Return ``expr``, adding a -1 series for suites present only in the 30d index."""
    return f" ( {expr} ) or on(suite) (0 * ( {QUALITY_GATE_SUITE_INDEX_30D} ) - 1) "


# Per-suite coverage: any *_quality_gate_coverage_percent metric, else the
# workspace line-coverage metric.
QUALITY_GATE_COVERAGE_BY_SUITE = (
    f' (max by (suite) ( {{ __name__=~ " .*_quality_gate_coverage_percent " , {PLATFORM_TEST_EXPORT_FILTER} }} )) '
    + " or on(suite) (max by (suite) (platform_quality_gate_workspace_line_coverage_percent "
    + f"{{ {PLATFORM_TEST_EXPORT_FILTER} }} )) "
)
QUALITY_GATE_COVERAGE_BY_SUITE_WITH_MISSING = _quality_gate_with_missing(
    QUALITY_GATE_COVERAGE_BY_SUITE
)
# Shortfall below 95, floored at 0.
QUALITY_GATE_COVERAGE_GAP_BY_SUITE = (
    " clamp_min(95 - ( " + QUALITY_GATE_COVERAGE_BY_SUITE + " ), 0) "
)

# Per-suite maximum of the source-lines-over-500 metric.
QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE = (
    " max by (suite) (platform_quality_gate_source_lines_over_500_total "
    + f"{{ {PLATFORM_TEST_EXPORT_FILTER} }} ) "
)
QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE_WITH_MISSING = _quality_gate_with_missing(
    QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE
)
2026-04-19 14:18:41 -03:00
# Backup age per (namespace, pvc), sorted descending; PVCs without an age
# series fall back to (1 - pvc_backup_health) * 999.
PVC_BACKUP_AGE_HOURS_BY_PVC = (
    " sort_desc(max by (namespace, pvc) ("
    "pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)"
    ")) "
)
2026-04-08 23:33:17 -03:00
# Base job selector plus the name and node of each of the two UPS sources.
ANANKE_SELECTOR = ' job= " ananke-power " '
ANANKE_UPS_DB_NAME = " Pyrphoros "
ANANKE_UPS_DB_NODE = " titan-db "
ANANKE_UPS_TETHYS_NAME = " Statera "
ANANKE_UPS_TETHYS_NODE = " titan-24 "


def _ananke_source_selector(source):
    """Return the job selector narrowed to a single ``source`` label value."""
    return f' {ANANKE_SELECTOR} ,source= " {source} " '


def _ananke_max(metric, selector):
    """Return max() of ``metric`` under ``selector``, or vector(0) when absent."""
    return f" max({metric} {{ {selector} }} ) or on() vector(0) "


def _ananke_draw_core(selector):
    """Return load percent times nominal watts, divided by 100."""
    return (
        f"(ananke_ups_load_percent {{ {selector} }} "
        f" * ananke_ups_power_nominal_watts {{ {selector} }} ) / 100"
    )


ANANKE_UPS_DB_SELECTOR = _ananke_source_selector(ANANKE_UPS_DB_NAME)
ANANKE_UPS_TETHYS_SELECTOR = _ananke_source_selector(ANANKE_UPS_TETHYS_NAME)

# Fleet-wide state across every source.
ANANKE_UPS_ON_BATTERY = f" sum(ananke_ups_on_battery {{ {ANANKE_SELECTOR} }} ) or on() vector(0) "
ANANKE_UPS_LOW_BATTERY = f" sum(ananke_ups_low_battery {{ {ANANKE_SELECTOR} }} ) or on() vector(0) "
ANANKE_UPS_RUNTIME_MIN = f" min(ananke_ups_runtime_seconds {{ {ANANKE_SELECTOR} }} ) or on() vector(0) "
# Lowest runtime as a percentage of the highest threshold (denominator >= 1).
ANANKE_UPS_RUNTIME_HEADROOM_PERCENT = "".join(
    (
        f" 100 * min(ananke_ups_runtime_seconds {{ {ANANKE_SELECTOR} }} ) / ",
        f" clamp_min(max(ananke_ups_threshold_seconds {{ {ANANKE_SELECTOR} }} ), 1) ",
    )
)
ANANKE_UPS_TRIGGER_COUNT_1D = (
    f" increase(ananke_shutdown_triggers_total {{ {ANANKE_SELECTOR} }} [1d]) or on() vector(0) "
)

# Per-source single-value expressions.
ANANKE_UPS_RUNTIME_DB = _ananke_max("ananke_ups_runtime_seconds", ANANKE_UPS_DB_SELECTOR)
ANANKE_UPS_RUNTIME_TETHYS = _ananke_max("ananke_ups_runtime_seconds", ANANKE_UPS_TETHYS_SELECTOR)
ANANKE_UPS_ON_BATTERY_DB = _ananke_max("ananke_ups_on_battery", ANANKE_UPS_DB_SELECTOR)
ANANKE_UPS_ON_BATTERY_TETHYS = _ananke_max("ananke_ups_on_battery", ANANKE_UPS_TETHYS_SELECTOR)
ANANKE_UPS_BATTERY_CHARGE_DB = _ananke_max("ananke_ups_battery_charge_percent", ANANKE_UPS_DB_SELECTOR)
ANANKE_UPS_BATTERY_CHARGE_TETHYS = _ananke_max(
    "ananke_ups_battery_charge_percent", ANANKE_UPS_TETHYS_SELECTOR
)
ANANKE_UPS_LOAD_DB = _ananke_max("ananke_ups_load_percent", ANANKE_UPS_DB_SELECTOR)
ANANKE_UPS_LOAD_TETHYS = _ananke_max("ananke_ups_load_percent", ANANKE_UPS_TETHYS_SELECTOR)

# Estimated draw in watts: single value (with zero fallback) and raw series.
ANANKE_UPS_DRAW_WATTS_DB = f" max({_ananke_draw_core(ANANKE_UPS_DB_SELECTOR)}) or on() vector(0) "
ANANKE_UPS_DRAW_WATTS_TETHYS = (
    f" max({_ananke_draw_core(ANANKE_UPS_TETHYS_SELECTOR)}) or on() vector(0) "
)
ANANKE_UPS_DRAW_WATTS_DB_SERIES = f" ({_ananke_draw_core(ANANKE_UPS_DB_SELECTOR)}) "
ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES = f" ({_ananke_draw_core(ANANKE_UPS_TETHYS_SELECTOR)}) "

# Raw series across every source.
ANANKE_UPS_RUNTIME_BY_SOURCE = f" ananke_ups_runtime_seconds {{ {ANANKE_SELECTOR} }} "
ANANKE_UPS_LOAD_BY_SOURCE = f" ananke_ups_load_percent {{ {ANANKE_SELECTOR} }} "
ANANKE_UPS_CHARGE_BY_SOURCE = f" ananke_ups_battery_charge_percent {{ {ANANKE_SELECTOR} }} "
ANANKE_UPS_TRIGGER_BY_SOURCE = f" ananke_ups_trigger_active {{ {ANANKE_SELECTOR} }} "
2026-04-12 17:20:05 -03:00
# ---------------------------------------------------------------------------
# Climate (typhon_*) expressions
# ---------------------------------------------------------------------------
# Scalars fall back to 0 through "or on() vector(0)" when no series exists.
CLIMATE_SENSOR_COUNT = " count(typhon_temperature_celsius) or on() vector(0) "
CLIMATE_TEMP_MAX = " max(typhon_temperature_celsius) or on() vector(0) "
# NOTE: the "pressure" constants read the typhon_vpd_kpa metric.
CLIMATE_PRESSURE_CURRENT = " max(typhon_vpd_kpa) or on() vector(0) "
CLIMATE_HUMIDITY_MAX = " max(typhon_relative_humidity_percent) or on() vector(0) "

# Raw series selectors.
CLIMATE_TEMP_SERIES = " typhon_temperature_celsius "
CLIMATE_PRESSURE_SERIES = " typhon_vpd_kpa "
CLIMATE_HUMIDITY_SERIES = " typhon_relative_humidity_percent "

# Dew point from temperature and relative humidity, using the constants
# 17.62 and 243.12: with g = ln(RH / 100) + 17.62 * T / (243.12 + T) the
# expression is 243.12 * g / (17.62 - g). Humidity is clamped to at least 1
# before the logarithm.
CLIMATE_DEWPOINT_SERIES = (
    " (243.12 * (ln(clamp_min(typhon_relative_humidity_percent, 1) / 100) + "
    " (17.62 * typhon_temperature_celsius) / (243.12 + typhon_temperature_celsius))) / "
    " (17.62 - (ln(clamp_min(typhon_relative_humidity_percent, 1) / 100) + "
    " (17.62 * typhon_temperature_celsius) / (243.12 + typhon_temperature_celsius))) "
)
CLIMATE_DEWPOINT_CURRENT = f" max( { CLIMATE_DEWPOINT_SERIES } ) or on() vector(0) "

# Fan speed level per fan_group: current value (max, zero fallback) ...
CLIMATE_FAN_OUTLET_CURRENT = ' max(typhon_fan_speed_level { fan_group= " outlet " }) or on() vector(0) '
CLIMATE_FAN_INSIDE_INLET_CURRENT = ' max(typhon_fan_speed_level { fan_group= " inside_inlet " }) or on() vector(0) '
CLIMATE_FAN_OUTSIDE_INLET_CURRENT = ' max(typhon_fan_speed_level { fan_group= " outside_inlet " }) or on() vector(0) '
CLIMATE_FAN_INTERIOR_CURRENT = ' max(typhon_fan_speed_level { fan_group= " interior " }) or on() vector(0) '
# ... and the matching raw series.
CLIMATE_FAN_OUTLET_SERIES = ' typhon_fan_speed_level { fan_group= " outlet " } '
CLIMATE_FAN_INSIDE_INLET_SERIES = ' typhon_fan_speed_level { fan_group= " inside_inlet " } '
CLIMATE_FAN_OUTSIDE_INLET_SERIES = ' typhon_fan_speed_level { fan_group= " outside_inlet " } '
CLIMATE_FAN_INTERIOR_SERIES = ' typhon_fan_speed_level { fan_group= " interior " } '
2026-01-22 18:23:17 -03:00
# Postgres connections: the summed activity count labelled conn="used",
# OR-ed with the max_connections setting labelled conn="max".
POSTGRES_CONN_USED = (
    ' label_replace(sum(pg_stat_activity_count), " conn " , " used " , " __name__ " , " .* " ) '
    ' or label_replace(max(pg_settings_max_connections), " conn " , " max " , " __name__ " , " .* " ) '
)
# Database (datname) with the highest summed activity count.
POSTGRES_CONN_HOTTEST = ' topk(1, sum by (datname) (pg_stat_activity_count)) '

# CronJob-owned Jobs, relabelled so owner_name carries the job_name; used
# below as the exclusion set.
ONEOFF_JOB_OWNER = ' label_replace(kube_job_owner { owner_kind= " CronJob " }, " owner_name " , " $1 " , " job_name " , " (.*) " ) '
# Job-owned pods whose (namespace, owner_name) does not match a CronJob-owned Job.
ONEOFF_JOB_PODS = f' (kube_pod_owner {{ owner_kind= " Job " }} unless on(namespace, owner_name) { ONEOFF_JOB_OWNER } ) '
# Age in hours of those pods, joined with their owner_name and with the
# Running/Succeeded phase series.
ONEOFF_JOB_POD_AGE_HOURS = (
    ' ((time() - kube_pod_start_time { pod!= " " }) / 3600) '
    f' * on(namespace,pod) group_left(owner_name) { ONEOFF_JOB_PODS } '
    ' * on(namespace,pod) group_left(phase) '
    ' max by (namespace,pod,phase) (kube_pod_status_phase { phase=~ " Running|Succeeded " }) '
)
2025-11-18 10:47:24 -03:00
# Nodes listed as GPU hosts, plus the same names joined into one alternation
# string for use in regex label matchers.
GPU_NODES = [
    " titan-20 ",
    " titan-21 ",
    " titan-22 ",
    " titan-24 ",
]
GPU_NODE_REGEX = " | ".join(GPU_NODES)
2025-11-17 18:55:11 -03:00
# Traefik request rate per router over 5 minutes.
TRAEFIK_ROUTER_EXPR = " sum by (router) (rate(traefik_router_requests_total[5m])) "

# Bytes/s received and transmitted by the traefik pods (zero fallback).
TRAEFIK_NET_INGRESS = (
    ' sum(rate(container_network_receive_bytes_total { namespace= " traefik " ,pod=~ " traefik-.* " }[5m])) '
    " or on() vector(0) "
)
TRAEFIK_NET_EGRESS = (
    ' sum(rate(container_network_transmit_bytes_total { namespace= " traefik " ,pod=~ " traefik-.* " }[5m])) '
    " or on() vector(0) "
)

# Cluster-wide container network traffic.
NET_CLUSTER_RX = (
    ' sum(rate(container_network_receive_bytes_total { namespace!= " " ,pod!= " " ,container!= " " }[5m])) '
    " or on() vector(0) "
)
NET_CLUSTER_TX = (
    ' sum(rate(container_network_transmit_bytes_total { namespace!= " " ,pod!= " " ,container!= " " }[5m])) '
    " or on() vector(0) "
)

# Node-level traffic, excluding devices whose name matches the loopback /
# CNI / veth / flannel / docker / virbr / vxlan / wg patterns below.
PHYSICAL_NET_FILTER = ' device!~ " lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.* " '
NET_NODE_RX_PHYS = f' sum(rate(node_network_receive_bytes_total {{ { PHYSICAL_NET_FILTER } }} [5m])) or on() vector(0) '
NET_NODE_TX_PHYS = f' sum(rate(node_network_transmit_bytes_total {{ { PHYSICAL_NET_FILTER } }} [5m])) or on() vector(0) '

# Aliases consumed by the dashboards.
# NOTE(review): NET_TOTAL_EXPR is the TX-only expression, not RX + TX --
# confirm this is intended.
NET_TOTAL_EXPR = NET_NODE_TX_PHYS
NET_INGRESS_EXPR = NET_NODE_RX_PHYS
NET_EGRESS_EXPR = NET_NODE_TX_PHYS

# Receive + transmit rate of every pod outside the traefik namespace.
NET_INTERNAL_EXPR = (
    ' sum(rate(container_network_receive_bytes_total { namespace!= " traefik " ,pod!= " " }[5m]) '
    ' + rate(container_network_transmit_bytes_total { namespace!= " traefik " ,pod!= " " }[5m])) '
    ' or on() vector(0) '
)
2025-12-12 18:00:43 -03:00
# API server: rate of 5xx responses and p99 request latency (seconds * 1000).
APISERVER_5XX_RATE = ' sum(rate(apiserver_request_total { code=~ " 5.. " }[5m])) '
APISERVER_P99_LATENCY_MS = " histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000 "

# etcd p99 request latency (seconds * 1000).
ETCD_P99_LATENCY_MS = " histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000 "

# Traefik entrypoint SLI over 5 minutes: non-5xx request rate divided by the
# total request rate, with the divisor clamped to at least 1.
TRAEFIK_TOTAL_5M = " sum(rate(traefik_entrypoint_requests_total[5m])) "
TRAEFIK_SUCCESS_5M = ' sum(rate(traefik_entrypoint_requests_total { code!~ " 5.. " }[5m])) '
TRAEFIK_SLI_5M = f" ( { TRAEFIK_SUCCESS_5M } ) / clamp_min( { TRAEFIK_TOTAL_5M } , 1) "

# Traefik entrypoint latency quantiles (seconds * 1000).
TRAEFIK_P99_LATENCY_MS = " histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000 "
TRAEFIK_P95_LATENCY_MS = " histogram_quantile(0.95, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000 "

# Availability objective; 1 - SLO_AVAILABILITY is the burn-rate divisor.
SLO_AVAILABILITY = 0.999
def traefik_sli(window):
    """Return the Traefik availability SLI expression for a rate window.

    The expression divides the rate of non-5xx entrypoint requests by the
    rate of all entrypoint requests, both taken over ``window``.

    NOTE(review): the divisor is clamp_min(total, 1), so a total below
    1 request/second is divided by 1 rather than by itself -- confirm this
    is the intended low-traffic behaviour.

    Args:
        window: Range-vector duration interpolated into ``rate(...[window])``.

    Returns:
        str: The SLI query expression.
    """
    non_5xx_rate = f' sum(rate(traefik_entrypoint_requests_total {{ code!~ " 5.. " }} [ { window } ])) '
    all_rate = f' sum(rate(traefik_entrypoint_requests_total[ { window } ])) '
    return f" ( { non_5xx_rate } ) / clamp_min( { all_rate } , 1) "
def traefik_burn(window):
    """Return the Traefik error-budget burn-rate expression for ``window``.

    The expression is ``(1 - SLI) / (1 - SLO_AVAILABILITY)``, where the SLI
    comes from ``traefik_sli(window)`` and the divisor is rendered into the
    query as a plain number.

    Args:
        window: Range-vector duration forwarded to ``traefik_sli``.

    Returns:
        str: The burn-rate query expression.
    """
    error_budget = 1 - SLO_AVAILABILITY
    return f" (1 - ( { traefik_sli(window) } )) / { error_budget } "
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# Panel factories
# ---------------------------------------------------------------------------
2025-11-17 14:22:46 -03:00
2025-11-17 16:27:38 -03:00
def stat_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit=" none ",
    decimals=None,
    thresholds=None,
    text_mode=" value ",
    legend=None,
    instant=False,
    value_suffix=None,
    links=None,
    targets=None,
    field_overrides=None,
    description=None,
    orientation=None,
    wide_layout=None,
):
    """Return a Grafana stat panel definition.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title.
        expr: Query expression for the single default target (refId A);
            ignored when ``targets`` is given.
        grid: Value stored as the panel's gridPos.
        unit: Field unit.
        decimals: Decimal places; left out of the field defaults when None.
        thresholds: Threshold config; when falsy, a grey base step with a
            green step at 1 is used.
        text_mode: Value of the stat ``textMode`` option.
        legend: ``legendFormat`` to apply; only used when the panel has
            exactly one target.
        instant: When true, every target without an ``instant`` key gets
            ``instant: True`` (an existing value is kept).
        value_suffix: Stored as ``custom.valueSuffix`` when truthy.
        links: Panel links, attached when truthy.
        targets: Explicit list of target dicts replacing the default
            target. Each dict is shallow-copied, so the caller's objects
            are never modified by ``legend`` / ``instant``.
        field_overrides: Field override list; an empty list when falsy.
        description: Panel description, attached when truthy.
        orientation: Stat ``orientation`` option, set when truthy.
        wide_layout: Stat ``wideLayout`` option, set when not None.

    Returns:
        dict: The panel definition.
    """
    defaults = {
        " color ": {" mode ": " thresholds "},
        " mappings ": [],
        " thresholds ": thresholds
        or {
            " mode ": " absolute ",
            " steps ": [
                {" color ": " rgba(115, 115, 115, 1) ", " value ": None},
                {" color ": " green ", " value ": 1},
            ],
        },
        " unit ": unit,
        " custom ": {" displayMode ": " auto "},
    }
    if value_suffix:
        defaults[" custom "][" valueSuffix "] = value_suffix
    if decimals is not None:
        defaults[" decimals "] = decimals

    if targets is not None:
        # Copy each target: legend/instant are written into the target dicts
        # below, and callers may reuse the same target list for other panels.
        target_list = [dict(target) for target in targets]
    else:
        target_list = [{" expr ": expr, " refId ": " A "}]

    panel = {
        " id ": panel_id,
        " type ": " stat ",
        " title ": title,
        " datasource ": PROM_DS,
        " gridPos ": grid,
        " targets ": target_list,
        " fieldConfig ": {" defaults ": defaults, " overrides ": field_overrides or []},
        " options ": {
            " colorMode ": " value ",
            " graphMode ": " area ",
            " justifyMode ": " center ",
            " reduceOptions ": {" calcs ": [" lastNotNull "], " fields ": " ", " values ": False},
            " textMode ": text_mode,
        },
    }
    if orientation:
        panel[" options "][" orientation "] = orientation
    if wide_layout is not None:
        panel[" options "][" wideLayout "] = wide_layout
    # A single legend format is only meaningful for a single-target panel.
    if legend and len(target_list) == 1:
        target_list[0][" legendFormat "] = legend
    if instant:
        for target in target_list:
            target.setdefault(" instant ", True)
    if links:
        panel[" links "] = links
    if description:
        panel[" description "] = description
    return panel
2025-11-18 12:11:47 -03:00
def gauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    min_value=0,
    max_value=1,
    thresholds=None,
    links=None,
):
    """Return a Grafana gauge panel definition with a single target.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title.
        expr: Query expression for the only target (refId A).
        grid: Value stored as the panel's gridPos.
        min_value: Lower bound of the gauge.
        max_value: Upper bound of the gauge; also the value of the red step
            in the default thresholds.
        thresholds: Threshold config; when falsy, a green base step with a
            red step at ``max_value`` is used.
        links: Panel links, attached when truthy.

    Returns:
        dict: The panel definition.
    """
    if not thresholds:
        thresholds = {
            " mode ": " absolute ",
            " steps ": [
                {" color ": " green ", " value ": None},
                {" color ": " red ", " value ": max_value},
            ],
        }
    field_defaults = {
        " min ": min_value,
        " max ": max_value,
        " thresholds ": thresholds,
    }
    gauge_options = {
        " reduceOptions ": {" calcs ": [" lastNotNull "], " fields ": " ", " values ": False},
        " orientation ": " auto ",
        " showThresholdMarkers ": False,
        " showThresholdLabels ": False,
    }
    panel = {
        " id ": panel_id,
        " type ": " gauge ",
        " title ": title,
        " datasource ": PROM_DS,
        " gridPos ": grid,
        " targets ": [{" expr ": expr, " refId ": " A "}],
        " fieldConfig ": {" defaults ": field_defaults, " overrides ": []},
        " options ": gauge_options,
    }
    if links:
        panel[" links "] = links
    return panel
2025-11-17 16:27:38 -03:00
def timeseries_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit=" none ",
    max_value=None,
    legend=None,
    legend_display=" table ",
    legend_placement=" bottom ",
    legend_calcs=None,
    time_from=None,
    links=None,
    targets=None,
    field_overrides=None,
    description=None,
):
    """Return a Grafana time-series panel definition.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title.
        expr: Query expression for the single default target (refId A);
            ignored when ``targets`` is given.
        grid: Value stored as the panel's gridPos.
        unit: Field unit.
        max_value: Stored as the field ``max`` when not None.
        legend: ``legendFormat`` to apply; only used when the panel has
            exactly one target.
        legend_display: Legend ``displayMode`` option.
        legend_placement: Legend ``placement`` option.
        legend_calcs: Legend ``calcs`` list, set when truthy.
        time_from: Stored as the panel ``timeFrom`` when truthy.
        links: Panel links, attached when truthy.
        targets: Explicit list of target dicts replacing the default
            target. Each dict is shallow-copied, so the caller's objects
            are never modified by ``legend``.
        field_overrides: Field override list; an empty list when falsy.
        description: Panel description, attached when truthy.

    Returns:
        dict: The panel definition.
    """
    if targets is not None:
        # Copy each target: the legend is written into the target dict below,
        # and callers may reuse the same target list for other panels.
        target_list = [dict(target) for target in targets]
    else:
        target_list = [{" expr ": expr, " refId ": " A "}]

    panel = {
        " id ": panel_id,
        " type ": " timeseries ",
        " title ": title,
        " datasource ": PROM_DS,
        " gridPos ": grid,
        " targets ": target_list,
        " fieldConfig ": {" defaults ": {" unit ": unit}, " overrides ": field_overrides or []},
        " options ": {
            " legend ": {
                " displayMode ": legend_display,
                " placement ": legend_placement,
            },
            " tooltip ": {" mode ": " multi "},
        },
    }
    if max_value is not None:
        panel[" fieldConfig "][" defaults "][" max "] = max_value
    # A single legend format is only meaningful for a single-target panel.
    if legend and len(target_list) == 1:
        target_list[0][" legendFormat "] = legend
    if legend_calcs:
        panel[" options "][" legend "][" calcs "] = legend_calcs
    if time_from:
        panel[" timeFrom "] = time_from
    if links:
        panel[" links "] = links
    if description:
        panel[" description "] = description
    return panel
2025-11-17 16:27:38 -03:00
def table_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit=" none ",
    transformations=None,
    instant=False,
    options=None,
    filterable=True,
    footer=None,
    format=None,
    description=None,
):
    """Return a Grafana table panel definition with a single target.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title.
        expr: Query expression for the only target (refId A).
        grid: Value stored as the panel's gridPos.
        unit: Field unit.
        transformations: Panel transformations, attached when truthy.
        instant: When true, the target is marked ``instant: True``.
        options: Extra panel options merged over the defaults
            (header shown, column filters off).
        filterable: Value of the ``custom.filterable`` field default.
        footer: Stored as the ``footer`` option when not None; applied after
            ``options``, so it wins over a footer given there.
        format: Stored as the target ``format`` when truthy.
        description: Panel description, attached when truthy.

    Returns:
        dict: The panel definition.
    """
    # Optional PromQL subquery helpers in expr: share(), etc.
    query = {" expr ": expr, " refId ": " A "}
    if instant:
        query[" instant "] = True
    if format:
        query[" format "] = format

    table_options = {" showHeader ": True, " columnFilters ": False}
    if options:
        table_options.update(options)
    if footer is not None:
        table_options[" footer "] = footer

    panel = {
        " id ": panel_id,
        " type ": " table ",
        " title ": title,
        " datasource ": PROM_DS,
        " gridPos ": grid,
        " targets ": [query],
        " fieldConfig ": {
            " defaults ": {" unit ": unit, " custom ": {" filterable ": filterable}},
            " overrides ": [],
        },
        " options ": table_options,
    }
    if transformations:
        panel[" transformations "] = transformations
    if description:
        panel[" description "] = description
    return panel
2026-04-19 14:18:41 -03:00
def pie_panel(panel_id, title, expr, grid, *, links=None, description=None):
    """Return a Grafana pie chart panel whose legend is keyed on namespace.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title.
        expr: Query expression for the only target (refId A).
        grid: Value stored as the panel's gridPos.
        links: Panel links, attached when truthy.
        description: Panel description, attached when truthy.

    Returns:
        dict: The panel definition.
    """
    query = {" expr ": expr, " refId ": " A ", " legendFormat ": " {{ namespace}} "}
    field_config = {
        " defaults ": {
            " unit ": " percent ",
            " color ": {" mode ": " palette-classic "},
        },
        " overrides ": [],
    }
    pie_options = {
        " legend ": {" displayMode ": " list ", " placement ": " right "},
        " pieType ": " pie ",
        " displayLabels ": [],
        " tooltip ": {" mode ": " single "},
        " colorScheme ": " interpolateSpectral ",
        " colorBy ": " value ",
        " reduceOptions ": {" calcs ": [" lastNotNull "], " fields ": " ", " values ": False},
    }
    panel = {
        " id ": panel_id,
        " type ": " piechart ",
        " title ": title,
        " datasource ": PROM_DS,
        " gridPos ": grid,
        " targets ": [query],
        " fieldConfig ": field_config,
        " options ": pie_options,
    }
    if links:
        panel[" links "] = links
    if description:
        panel[" description "] = description
    return panel
2025-11-17 14:22:46 -03:00
2026-01-01 14:44:33 -03:00
def namespace_scope_variable(var_name, label):
    """Return a custom dashboard variable that selects a namespace scope.

    The variable offers three choices (workload, all, infrastructure) whose
    values are the NAMESPACE_SCOPE_* constants; the workload choice is the
    selected/current one. The variable is emitted with ``hide: 2``.

    Args:
        var_name: Variable name.
        label: Variable label.

    Returns:
        dict: The templating variable definition.
    """
    choices = (
        (" workload namespaces only ", NAMESPACE_SCOPE_WORKLOAD),
        (" all namespaces ", NAMESPACE_SCOPE_ALL),
        (" infrastructure namespaces only ", NAMESPACE_SCOPE_INFRA),
    )
    # Only the first (workload) choice is pre-selected.
    options = [
        {" text ": text, " value ": value, " selected ": position == 0}
        for position, (text, value) in enumerate(choices)
    ]
    query = "".join(
        (
            " workload namespaces only : ",
            NAMESPACE_SCOPE_WORKLOAD,
            " ,all namespaces : ",
            NAMESPACE_SCOPE_ALL,
            " ,infrastructure namespaces only : ",
            NAMESPACE_SCOPE_INFRA,
        )
    )
    default_choice = options[0]
    return {
        " name ": var_name,
        " label ": label,
        " type ": " custom ",
        " query ": query,
        " current ": {
            " text ": default_choice[" text "],
            " value ": default_choice[" value "],
            " selected ": True,
        },
        " options ": options,
        " hide ": 2,
        " multi ": False,
        " includeAll ": False,
        " refresh ": 1,
        " sort ": 0,
        " skipUrlSync ": False,
    }
2026-04-19 14:18:41 -03:00
def namespace_scope_links(var_name):
    """Return panel links that switch ``var_name`` between namespace scopes.

    Each link URL is a query string with one ``var-<name>`` parameter per
    entry of NAMESPACE_SCOPE_VARS: the parameter for ``var_name`` carries the
    URL-quoted target scope, every other one re-emits that variable's own
    ``${...}`` reference.

    Args:
        var_name: Name of the scope variable the links should change.

    Returns:
        list[dict]: Three link definitions (workload, all, infrastructure),
        all opening in the same tab.
    """

    def with_value(value):
        # Query string that sets var_name to `value` and carries the other
        # scope variables through unchanged.
        encoded = urllib.parse.quote(value, safe=" ")
        params = [
            f" var- { name } = { encoded } " if name == var_name else f" var- { name } =$ {{ { name } }} "
            for name in NAMESPACE_SCOPE_VARS
        ]
        return " ? " + " & ".join(params)

    scopes = (
        (" Workload namespaces only ", NAMESPACE_SCOPE_WORKLOAD),
        (" All namespaces ", NAMESPACE_SCOPE_ALL),
        (" Infrastructure namespaces only ", NAMESPACE_SCOPE_INFRA),
    )
    return [
        {" title ": link_title, " url ": with_value(scope), " targetBlank ": False}
        for link_title, scope in scopes
    ]
2026-04-18 17:47:06 -03:00
def testing_suite_variable():
    """Return the custom ``suite`` dashboard variable.

    One option is generated per entry of PLATFORM_TEST_SUITE_NAMES; its value
    comes from PLATFORM_TEST_SUITE_VALUE_BY_NAME and falls back to the suite
    name itself. "All" is enabled, is the current selection, and expands to
    PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER.

    Returns:
        dict: The templating variable definition.
    """
    options = []
    query_parts = []
    for suite in PLATFORM_TEST_SUITE_NAMES:
        suite_value = PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite)
        options.append({" text ": suite, " value ": suite_value, " selected ": False})
        query_parts.append(f" { suite } : { suite_value } ")
    return {
        " name ": " suite ",
        " label ": " Suite ",
        " type ": " custom ",
        " query ": " , ".join(query_parts),
        " current ": {" text ": " All ", " value ": " $__all ", " selected ": True},
        " options ": options,
        " hide ": 0,
        " multi ": False,
        " includeAll ": True,
        " allValue ": PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER,
        " refresh ": 1,
        " sort ": 1,
        " skipUrlSync ": False,
    }
2026-04-20 08:35:05 -03:00
def testing_case_variable():
    """Return the query-backed ``test`` (test case) dashboard variable.

    Values are the ``test`` labels of platform_quality_gate_test_case_result
    for the selected suite, excluding the ``__no_test_cases__`` placeholder
    and restricted by PLATFORM_TEST_EXPORT_FILTER. "All" is enabled, is the
    current selection, and expands to the ``.*`` matcher.

    Returns:
        dict: The templating variable definition.
    """
    label_query = f' label_values(platform_quality_gate_test_case_result {{ suite=~ " $ {{ suite:regex }} " ,test!= " __no_test_cases__ " , { PLATFORM_TEST_EXPORT_FILTER } }} , test) '
    variable = {
        " name ": " test ",
        " label ": " Test Case ",
        " type ": " query ",
        " query ": label_query,
        " current ": {" text ": " All ", " value ": " $__all ", " selected ": True},
        " options ": [],
        " hide ": 0,
        " multi ": False,
        " includeAll ": True,
        " allValue ": " .* ",
        " refresh ": 2,
        " sort ": 1,
        " skipUrlSync ": False,
    }
    return variable
2026-04-21 09:35:43 -03:00
def testing_branch_variable():
    """Return the query-backed ``branch`` dashboard variable.

    Values are the ``branch`` labels of platform_quality_gate_build_info for
    the selected suite, restricted by PLATFORM_TEST_EXPORT_FILTER. "All" is
    enabled, is the current selection, and expands to the ``.*`` matcher.

    Returns:
        dict: The templating variable definition.
    """
    label_query = f' label_values(platform_quality_gate_build_info {{ suite=~ " $ {{ suite:regex }} " , { PLATFORM_TEST_EXPORT_FILTER } }} , branch) '
    variable = {
        " name ": " branch ",
        " label ": " Branch ",
        " type ": " query ",
        " query ": label_query,
        " current ": {" text ": " All ", " value ": " $__all ", " selected ": True},
        " options ": [],
        " hide ": 0,
        " multi ": False,
        " includeAll ": True,
        " allValue ": " .* ",
        " refresh ": 2,
        " sort ": 1,
        " skipUrlSync ": False,
    }
    return variable
2026-04-20 13:45:01 -03:00
def jenkins_base_variable():
    """Return the ``jenkins_base`` textbox variable.

    The query, current text and current value are all initialised to
    JENKINS_UI_BASE_DEFAULT.

    Returns:
        dict: The templating variable definition.
    """
    default_base = JENKINS_UI_BASE_DEFAULT
    return {
        " name ": " jenkins_base ",
        " label ": " Jenkins Base URL ",
        " type ": " textbox ",
        " query ": default_base,
        " current ": {" text ": default_base, " value ": default_base, " selected ": True},
        " hide ": 0,
        " skipUrlSync ": False,
    }
def jenkins_suite_links(base_var=" $ {jenkins_base} "):
    """Return dashboard links into Jenkins for every platform test suite.

    The list starts with a link to the Jenkins root, followed by two links
    per entry of PLATFORM_TEST_SUITE_NAMES: the job page and the artifacts of
    its last completed build. The job name comes from
    PLATFORM_TEST_JENKINS_JOB_BY_SUITE (falling back to the suite name) and is
    URL-quoted. Every link opens in a new tab.

    Args:
        base_var: Base URL, or a dashboard-variable reference resolving to
            it, that is prepended to every link.

    Returns:
        list[dict]: The link definitions.
    """
    links = [{" title ": " Open Jenkins ", " url ": f" { base_var } / ", " targetBlank ": True}]
    for suite in PLATFORM_TEST_SUITE_NAMES:
        job_name = PLATFORM_TEST_JENKINS_JOB_BY_SUITE.get(suite, suite)
        encoded_job = urllib.parse.quote(job_name, safe=" ")
        links.extend(
            [
                {
                    " title ": f" { suite } : Job ",
                    " url ": f" { base_var } /job/ { encoded_job } / ",
                    " targetBlank ": True,
                },
                {
                    " title ": f" { suite } : Last Artifacts ",
                    " url ": f" { base_var } /job/ { encoded_job } /lastCompletedBuild/artifact/ ",
                    " targetBlank ": True,
                },
            ]
        )
    return links
2025-12-12 20:20:13 -03:00
def bargauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit=" none ",
    legend=None,
    links=None,
    limit=None,
    sort_order=" desc ",
    thresholds=None,
    decimals=None,
    instant=False,
    overrides=None,
):
    """Return a Grafana bar gauge panel with label-aware reduction.

    Unless ``expr`` already starts with a sort call, it is wrapped in
    ``sort_desc(...)`` or ``sort(...)`` according to ``sort_order``. The panel
    additionally carries a ``sortBy`` transformation on ``Value`` using the
    same order, optionally followed by a ``limit`` transformation.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title.
        expr: Query expression for the only target (refId A).
        grid: Value stored as the panel's gridPos.
        unit: Field unit; the percent unit also fixes the field max at 100.
        legend: ``legendFormat`` for the target; defaults to the node label.
        links: Panel links, attached when truthy.
        limit: Row limit for the ``limit`` transformation, added when truthy.
        sort_order: Sort direction for the query wrapper and the ``sortBy``
            transformation; any value other than the desc/asc defaults
            leaves ``expr`` unwrapped.
        thresholds: Threshold config; when falsy, green/yellow/orange/red
            steps at base/50/70/85 are used.
        decimals: Decimal places; left out of the field defaults when None.
        instant: When true, the target is marked ``instant: True``.
        overrides: Field overrides, copied into the panel when truthy.

    Returns:
        dict: The panel definition.
    """
    presorted = expr.strip().startswith((" sort( ", " sort_desc( "))
    if not presorted and sort_order == " desc ":
        expr = f" sort_desc( { expr } ) "
    elif not presorted and sort_order == " asc ":
        expr = f" sort( { expr } ) "

    query = {
        " expr ": expr,
        " refId ": " A ",
        " legendFormat ": legend or " {{ node}} ",
    }
    if instant:
        query[" instant "] = True

    field_defaults = {
        " unit ": unit,
        " min ": 0,
        " max ": 100 if unit == " percent " else None,
        " thresholds ": thresholds
        or {
            " mode ": " absolute ",
            " steps ": [
                {" color ": " green ", " value ": None},
                {" color ": " yellow ", " value ": 50},
                {" color ": " orange ", " value ": 70},
                {" color ": " red ", " value ": 85},
            ],
        },
    }
    if decimals is not None:
        field_defaults[" decimals "] = decimals

    # Keep the bars ordered by value, in the requested direction.
    transformations = [{" id ": " sortBy ", " options ": {" fields ": [" Value "], " order ": sort_order}}]
    if limit:
        transformations.append({" id ": " limit ", " options ": {" limit ": limit}})

    panel = {
        " id ": panel_id,
        " type ": " bargauge ",
        " title ": title,
        " datasource ": PROM_DS,
        " gridPos ": grid,
        " targets ": [query],
        " fieldConfig ": {
            " defaults ": field_defaults,
            " overrides ": list(overrides) if overrides else [],
        },
        " options ": {
            " displayMode ": " gradient ",
            " orientation ": " horizontal ",
            " reduceOptions ": {
                " calcs ": [" lastNotNull "],
                " fields ": " ",
                " values ": False,
            },
        },
    }
    if links:
        panel[" links "] = links
    panel[" transformations "] = transformations
    return panel
2026-04-19 14:18:41 -03:00
def text_panel(panel_id, title, content, grid):
    """Return a Grafana text panel rendering ``content`` in markdown mode.

    Args:
        panel_id: Value stored as the panel id.
        title: Panel title.
        content: Text stored as the panel content.
        grid: Value stored as the panel's gridPos.

    Returns:
        dict: The panel definition; its datasource is None.
    """
    panel = {
        " id ": panel_id,
        " type ": " text ",
        " title ": title,
        " gridPos ": grid,
        " datasource ": None,
        " options ": {" mode ": " markdown ", " content ": content},
    }
    return panel
2026-04-13 00:25:33 -03:00
2026-04-19 14:18:41 -03:00
# Link titles keyed by dashboard uid. Note that the atlas-jobs dashboard is
# titled "Testing".
DASHBOARD_LINK_TITLES = {
    " atlas-overview ": " Open Atlas Overview ",
    " atlas-pods ": " Open Atlas Pods ",
    " atlas-nodes ": " Open Atlas Nodes ",
    " atlas-storage ": " Open Atlas Storage ",
    " atlas-network ": " Open Atlas Network ",
    " atlas-mail ": " Open Atlas Mail ",
    " atlas-jobs ": " Open Atlas Testing ",
    " atlas-power ": " Open Atlas Power ",
    " atlas-gpu ": " Open Atlas GPU ",
}
2026-04-13 23:13:45 -03:00
2026-04-19 14:18:41 -03:00
def link_to(uid):
    """Return a one-element link list pointing at the dashboard ``uid``.

    The title is looked up in DASHBOARD_LINK_TITLES and falls back to a
    generic "Open <uid> dashboard" text; the link opens in a new tab.

    Args:
        uid: Dashboard uid, interpolated into the ``/d/`` URL.

    Returns:
        list[dict]: A list holding the single link definition.
    """
    fallback_title = f" Open { uid } dashboard "
    link = {
        " title ": DASHBOARD_LINK_TITLES.get(uid, fallback_title),
        " url ": f" /d/ { uid } ",
        " targetBlank ": True,
    }
    return [link]
2026-04-13 23:13:45 -03:00
2026-04-19 14:18:41 -03:00
# ---------------------------------------------------------------------------
# Dashboard builders
# ---------------------------------------------------------------------------
def build_overview ( ) :
panels = [ ]
count_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " red " , " value " : 3 } ,
2025-12-12 15:23:51 -03:00
] ,
}
2026-01-21 13:37:36 -03:00
age_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 6 } ,
{ " color " : " orange " , " value " : 24 } ,
{ " color " : " red " , " value " : 48 } ,
] ,
}
2025-11-18 15:55:24 -03:00
2025-12-12 15:23:51 -03:00
row1_stats = [
{
" id " : 2 ,
" title " : " Control Plane Ready " ,
" expr " : f ' sum(kube_node_status_condition {{ condition= " Ready " ,status= " true " ,node=~ " { CONTROL_REGEX } " }} ) ' ,
" kind " : " gauge " ,
" max_value " : CONTROL_TOTAL ,
" thresholds " : {
2025-11-17 19:24:03 -03:00
" mode " : " absolute " ,
" steps " : [
2025-11-18 11:12:03 -03:00
{ " color " : " red " , " value " : None } ,
2025-11-17 19:24:03 -03:00
{ " color " : " green " , " value " : CONTROL_TOTAL } ,
] ,
2025-12-12 15:23:51 -03:00
} ,
} ,
{
" id " : 3 ,
" title " : " Control Plane Workloads " ,
" expr " : CONTROL_WORKLOADS_EXPR ,
" kind " : " stat " ,
2025-12-12 20:30:00 -03:00
" thresholds " : {
" mode " : " absolute " ,
" steps " : [
2025-12-12 20:50:41 -03:00
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " red " , " value " : 3 } ,
2025-12-12 20:30:00 -03:00
] ,
} ,
2025-12-12 15:23:51 -03:00
" links " : link_to ( " atlas-pods " ) ,
} ,
2025-12-12 15:56:33 -03:00
{
" id " : 5 ,
" title " : " Stuck Terminating " ,
" expr " : STUCK_TERMINATING_EXPR ,
" kind " : " stat " ,
2025-12-12 20:30:00 -03:00
" thresholds " : {
" mode " : " absolute " ,
" steps " : [
2025-12-12 20:50:41 -03:00
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " red " , " value " : 3 } ,
2025-12-12 20:30:00 -03:00
] ,
} ,
2025-12-12 15:56:33 -03:00
" links " : link_to ( " atlas-pods " ) ,
} ,
2025-12-12 15:23:51 -03:00
{
" id " : 27 ,
2025-12-19 13:46:34 -03:00
" title " : " Atlas Availability " ,
2025-12-12 16:11:28 -03:00
" expr " : UPTIME_PERCENT_EXPR ,
2025-12-12 15:23:51 -03:00
" kind " : " stat " ,
2025-12-12 16:11:28 -03:00
" thresholds " : UPTIME_PERCENT_THRESHOLDS ,
2025-12-12 16:15:37 -03:00
" unit " : " percentunit " ,
2025-12-19 15:18:14 -03:00
" decimals " : 4 ,
2025-12-12 15:23:51 -03:00
" text_mode " : " value " ,
} ,
{
" id " : 4 ,
" title " : " Problem Pods " ,
" expr " : PROBLEM_PODS_EXPR ,
" kind " : " stat " ,
2025-12-12 20:30:00 -03:00
" thresholds " : {
" mode " : " absolute " ,
" steps " : [
2025-12-12 20:50:41 -03:00
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " red " , " value " : 3 } ,
2025-12-12 20:30:00 -03:00
] ,
} ,
2025-12-12 15:23:51 -03:00
" links " : link_to ( " atlas-pods " ) ,
} ,
{
" id " : 6 ,
" title " : " CrashLoop / ImagePull " ,
" expr " : CRASHLOOP_EXPR ,
" kind " : " stat " ,
2025-12-12 20:30:00 -03:00
" thresholds " : {
" mode " : " absolute " ,
" steps " : [
2025-12-12 20:50:41 -03:00
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " red " , " value " : 3 } ,
2025-12-12 20:30:00 -03:00
] ,
} ,
2025-12-12 15:23:51 -03:00
" links " : link_to ( " atlas-pods " ) ,
} ,
{
2025-12-12 15:56:33 -03:00
" id " : 1 ,
" title " : " Workers Ready " ,
" expr " : f ' sum(kube_node_status_condition {{ condition= " Ready " ,status= " true " ,node=~ " { WORKER_REGEX } " }} ) ' ,
" kind " : " gauge " ,
" max_value " : WORKER_TOTAL ,
" thresholds " : {
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
{ " color " : " orange " , " value " : WORKER_TOTAL - 2 } ,
{ " color " : " yellow " , " value " : WORKER_TOTAL - 1 } ,
{ " color " : " green " , " value " : WORKER_TOTAL } ,
] ,
} ,
2025-12-12 15:23:51 -03:00
} ,
]
def gauge_grid(idx, widths=None, default_width=4):
    """Return the ``(width, x)`` grid placement for the ``idx``-th row panel.

    Panels are laid out left to right: the x offset of a panel is the sum of
    the widths of every panel placed before it.

    Args:
        idx: Zero-based position of the panel in the row.
        widths: Per-panel widths. Defaults to the module-level
            ``GAUGE_WIDTHS`` when omitted.
        default_width: Width used for any panel whose index is past the end
            of ``widths``.

    Returns:
        A ``(width, x)`` tuple for the panel's ``gridPos``.
    """
    if widths is None:
        widths = GAUGE_WIDTHS
    # Number of fallback-width panels that sit before this one. Without this
    # term every panel past the end of ``widths`` would get the same x offset
    # and overlap its neighbour.
    overflow = max(0, idx - len(widths))
    width = widths[idx] if idx < len(widths) else default_width
    x = sum(widths[:idx]) + overflow * default_width
    return width, x
for idx , item in enumerate ( row1_stats ) :
panel_id = item [ " id " ]
2025-11-18 15:55:24 -03:00
width , x = gauge_grid ( idx )
2025-12-12 15:23:51 -03:00
grid = { " h " : 5 , " w " : width , " x " : x , " y " : 0 }
kind = item . get ( " kind " , " gauge " )
if kind == " stat " :
2025-11-18 17:09:13 -03:00
panels . append (
stat_panel (
panel_id ,
2025-12-12 15:23:51 -03:00
item [ " title " ] ,
item [ " expr " ] ,
grid ,
thresholds = item . get ( " thresholds " ) ,
2025-12-12 16:15:37 -03:00
legend = None ,
links = item . get ( " links " ) ,
text_mode = item . get ( " text_mode " , " value " ) ,
value_suffix = item . get ( " value_suffix " ) ,
unit = item . get ( " unit " , " none " ) ,
decimals = item . get ( " decimals " ) ,
)
)
2025-11-18 17:09:13 -03:00
else :
panels . append (
gauge_panel (
panel_id ,
2025-12-12 15:23:51 -03:00
item [ " title " ] ,
item [ " expr " ] ,
grid ,
min_value = 0 ,
max_value = item . get ( " max_value " , 5 ) ,
thresholds = item . get ( " thresholds " ) ,
links = item . get ( " links " ) ,
2025-11-18 17:09:13 -03:00
)
2025-11-17 14:22:46 -03:00
)
2025-11-17 16:27:38 -03:00
2026-04-09 14:56:43 -03:00
top_health_panels = [
2025-11-17 21:20:19 -03:00
( 7 , " Hottest node: CPU " , topk_with_node ( node_cpu_expr ( ) ) , " percent " ) ,
( 8 , " Hottest node: RAM " , topk_with_node ( node_mem_expr ( ) ) , " percent " ) ,
2025-11-17 20:19:20 -03:00
( 9 , " Hottest node: NET (rx+tx) " , topk_with_node ( node_net_expr ( ) ) , " Bps " ) ,
( 10 , " Hottest node: I/O (r+w) " , topk_with_node ( node_io_expr ( ) ) , " Bps " ) ,
2026-04-09 14:56:43 -03:00
( 23 , " Astreae Usage " , astreae_usage_expr ( " /mnt/astreae " ) , " percent " ) ,
( 24 , " Asteria Usage " , astreae_usage_expr ( " /mnt/asteria " ) , " percent " ) ,
( 25 , " Astreae Free " , astreae_free_expr ( " /mnt/astreae " ) , " decbytes " ) ,
( 26 , " Asteria Free " , astreae_free_expr ( " /mnt/asteria " ) , " decbytes " ) ,
2025-11-17 16:27:38 -03:00
]
2026-04-09 14:56:43 -03:00
for idx , ( panel_id , title , expr , unit ) in enumerate ( top_health_panels ) :
is_hottest_panel = panel_id in { 7 , 8 , 9 , 10 }
2025-11-17 16:27:38 -03:00
panels . append (
stat_panel (
panel_id ,
title ,
2025-11-17 20:19:20 -03:00
f " { expr } " ,
2026-04-09 14:56:43 -03:00
{ " h " : 2 , " w " : 3 , " x " : 3 * idx , " y " : 5 } ,
2025-11-17 16:27:38 -03:00
unit = unit ,
thresholds = PERCENT_THRESHOLDS if unit == " percent " else None ,
2026-04-09 14:56:43 -03:00
text_mode = " name_and_value " if is_hottest_panel else " value " ,
legend = " {{ node}} " if is_hottest_panel else None ,
instant = is_hottest_panel ,
links = link_to ( " atlas-storage " if panel_id in { 23 , 24 , 25 , 26 } else " atlas-nodes " ) ,
2025-11-17 16:27:38 -03:00
)
)
2026-01-05 21:55:59 -03:00
mail_bounce_rate_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 5 } ,
{ " color " : " orange " , " value " : 8 } ,
{ " color " : " red " , " value " : 10 } ,
] ,
}
2026-01-06 02:06:20 -03:00
mail_limit_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 70 } ,
{ " color " : " orange " , " value " : 85 } ,
{ " color " : " red " , " value " : 95 } ,
] ,
}
2026-01-06 02:34:52 -03:00
mail_success_thresholds = {
2026-01-05 21:55:59 -03:00
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
2026-01-06 02:34:52 -03:00
{ " color " : " orange " , " value " : 90 } ,
{ " color " : " yellow " , " value " : 95 } ,
{ " color " : " green " , " value " : 98 } ,
2026-01-05 21:55:59 -03:00
] ,
}
2026-04-19 14:18:41 -03:00
status_mapping = [
{
" type " : " value " ,
" options " : {
" 0 " : { " text " : " ⚡ Charging " } ,
" 1 " : { " text " : " 🔋 Discharging " } ,
} ,
}
]
2026-04-03 14:55:16 -03:00
panels . append (
2026-04-13 06:22:41 -03:00
stat_panel (
2026-04-03 14:55:16 -03:00
40 ,
2026-04-19 14:18:41 -03:00
" UPS Current Load " ,
None ,
{ " h " : 6 , " w " : 4 , " x " : 0 , " y " : 12 } ,
unit = " none " ,
decimals = 1 ,
2026-04-13 06:22:41 -03:00
text_mode = " name_and_value " ,
2026-04-19 14:18:41 -03:00
targets = [
{ " refId " : " A " , " expr " : ANANKE_UPS_DRAW_WATTS_DB , " legendFormat " : f " { ANANKE_UPS_DB_NAME } Draw (W) " , " instant " : True } ,
{ " refId " : " B " , " expr " : ANANKE_UPS_RUNTIME_DB , " legendFormat " : f " { ANANKE_UPS_DB_NAME } Discharge " , " instant " : True } ,
{ " refId " : " C " , " expr " : ANANKE_UPS_ON_BATTERY_DB , " legendFormat " : f " { ANANKE_UPS_DB_NAME } Status " , " instant " : True } ,
{ " refId " : " D " , " expr " : ANANKE_UPS_DRAW_WATTS_TETHYS , " legendFormat " : f " { ANANKE_UPS_TETHYS_NAME } Draw (W) " , " instant " : True } ,
{ " refId " : " E " , " expr " : ANANKE_UPS_RUNTIME_TETHYS , " legendFormat " : f " { ANANKE_UPS_TETHYS_NAME } Discharge " , " instant " : True } ,
{ " refId " : " F " , " expr " : ANANKE_UPS_ON_BATTERY_TETHYS , " legendFormat " : f " { ANANKE_UPS_TETHYS_NAME } Status " , " instant " : True } ,
2026-04-03 22:16:02 -03:00
] ,
2026-04-13 03:35:39 -03:00
field_overrides = [
2026-04-19 14:18:41 -03:00
{
" matcher " : { " id " : " byName " , " options " : f " { ANANKE_UPS_DB_NAME } Draw (W) " } ,
" properties " : [ { " id " : " unit " , " value " : " watt " } , { " id " : " description " , " value " : f " Attached node: { ANANKE_UPS_DB_NODE } " } ] ,
} ,
{
" matcher " : { " id " : " byName " , " options " : f " { ANANKE_UPS_TETHYS_NAME } Draw (W) " } ,
" properties " : [ { " id " : " unit " , " value " : " watt " } , { " id " : " description " , " value " : f " Attached node: { ANANKE_UPS_TETHYS_NODE } " } ] ,
} ,
{
" matcher " : { " id " : " byName " , " options " : f " { ANANKE_UPS_DB_NAME } Discharge " } ,
" properties " : [ { " id " : " unit " , " value " : " s " } , { " id " : " description " , " value " : f " Attached node: { ANANKE_UPS_DB_NODE } " } ] ,
} ,
{
" matcher " : { " id " : " byName " , " options " : f " { ANANKE_UPS_TETHYS_NAME } Discharge " } ,
" properties " : [ { " id " : " unit " , " value " : " s " } , { " id " : " description " , " value " : f " Attached node: { ANANKE_UPS_TETHYS_NODE } " } ] ,
} ,
{
" matcher " : { " id " : " byName " , " options " : f " { ANANKE_UPS_DB_NAME } Status " } ,
" properties " : [ { " id " : " mappings " , " value " : status_mapping } , { " id " : " description " , " value " : f " Attached node: { ANANKE_UPS_DB_NODE } " } ] ,
} ,
{
" matcher " : { " id " : " byName " , " options " : f " { ANANKE_UPS_TETHYS_NAME } Status " } ,
" properties " : [ { " id " : " mappings " , " value " : status_mapping } , { " id " : " description " , " value " : f " Attached node: { ANANKE_UPS_TETHYS_NODE } " } ] ,
} ,
2026-04-13 01:08:58 -03:00
] ,
2026-04-19 14:18:41 -03:00
orientation = " horizontal " ,
wide_layout = True ,
2026-04-03 14:55:16 -03:00
links = link_to ( " atlas-power " ) ,
2026-04-19 14:18:41 -03:00
description = " Per-UPS live snapshot: current draw, discharge, and charging/discharging status. " ,
2026-04-03 14:55:16 -03:00
)
)
2026-04-12 22:25:34 -03:00
panels . append (
2026-04-03 22:16:02 -03:00
timeseries_panel (
2026-04-03 14:55:16 -03:00
41 ,
2026-04-03 22:16:02 -03:00
" UPS History (Power Draw) " ,
None ,
2026-04-19 14:18:41 -03:00
{ " h " : 6 , " w " : 4 , " x " : 4 , " y " : 12 } ,
2026-04-03 22:16:02 -03:00
unit = " watt " ,
targets = [
2026-04-08 23:33:17 -03:00
{ " refId " : " A " , " expr " : ANANKE_UPS_DRAW_WATTS_DB_SERIES , " legendFormat " : ANANKE_UPS_DB_NAME } ,
{ " refId " : " B " , " expr " : ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES , " legendFormat " : ANANKE_UPS_TETHYS_NAME } ,
2026-04-03 22:16:02 -03:00
] ,
2026-04-19 14:18:41 -03:00
legend_display = " list " ,
legend_placement = " bottom " ,
2026-04-03 14:55:16 -03:00
links = link_to ( " atlas-power " ) ,
)
)
panels . append (
2026-04-13 06:22:41 -03:00
stat_panel (
2026-04-03 14:55:16 -03:00
42 ,
2026-04-19 14:18:41 -03:00
" Current Climate " ,
None ,
{ " h " : 6 , " w " : 4 , " x " : 8 , " y " : 12 } ,
unit = " none " ,
decimals = 2 ,
text_mode = " value " ,
targets = [
{ " refId " : " A " , " expr " : CLIMATE_TEMP_MAX , " legendFormat " : " Tent Temp (°C) " , " instant " : True } ,
{ " refId " : " B " , " expr " : CLIMATE_PRESSURE_CURRENT , " legendFormat " : " Tent VPD (kPa) " , " instant " : True } ,
{ " refId " : " C " , " expr " : CLIMATE_HUMIDITY_MAX , " legendFormat " : " Tent RH ( % ) " , " instant " : True } ,
{ " refId " : " D " , " expr " : CLIMATE_DEWPOINT_CURRENT , " legendFormat " : " Dew Point (°C) " , " instant " : True } ,
2026-04-03 22:16:02 -03:00
] ,
2026-04-13 03:35:39 -03:00
field_overrides = [
2026-04-19 14:18:41 -03:00
{ " matcher " : { " id " : " byName " , " options " : " Tent Temp (°C) " } , " properties " : [ { " id " : " unit " , " value " : " celsius " } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " Tent VPD (kPa) " } , " properties " : [ { " id " : " unit " , " value " : " suffix:kPa " } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " Tent RH ( % ) " } , " properties " : [ { " id " : " unit " , " value " : " percent " } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " Dew Point (°C) " } , " properties " : [ { " id " : " unit " , " value " : " celsius " } ] } ,
2026-04-13 03:35:39 -03:00
] ,
links = link_to ( " atlas-power " ) ,
2026-04-19 14:18:41 -03:00
description = " Current tent temperature, humidity, VPD, and dew point. " ,
orientation = " horizontal " ,
wide_layout = True ,
2026-04-13 03:35:39 -03:00
)
)
2026-04-19 14:18:41 -03:00
panels . append (
timeseries_panel (
2026-04-03 22:16:02 -03:00
43 ,
2026-04-19 14:18:41 -03:00
" Climate History " ,
2026-04-03 22:16:02 -03:00
None ,
2026-04-19 14:18:41 -03:00
{ " h " : 6 , " w " : 4 , " x " : 12 , " y " : 12 } ,
unit = " celsius " ,
2026-04-03 22:16:02 -03:00
targets = [
2026-04-19 14:18:41 -03:00
{ " refId " : " A " , " expr " : CLIMATE_TEMP_SERIES , " legendFormat " : " Temperature (°C) " } ,
{ " refId " : " B " , " expr " : CLIMATE_HUMIDITY_SERIES , " legendFormat " : " Humidity ( % ) " } ,
{ " refId " : " C " , " expr " : CLIMATE_PRESSURE_SERIES , " legendFormat " : " VPD (kPa) " } ,
{ " refId " : " D " , " expr " : CLIMATE_DEWPOINT_SERIES , " legendFormat " : " Dew Point (°C) " } ,
2026-04-03 22:16:02 -03:00
] ,
field_overrides = [
2026-04-12 22:53:23 -03:00
{
2026-04-19 14:18:41 -03:00
" matcher " : { " id " : " byName " , " options " : " Humidity ( % ) " } ,
2026-04-13 00:17:29 -03:00
" properties " : [
2026-04-19 14:18:41 -03:00
{ " id " : " unit " , " value " : " percent " } ,
2026-04-13 00:17:29 -03:00
] ,
} ,
2026-04-12 17:28:15 -03:00
{
2026-04-19 14:18:41 -03:00
" matcher " : { " id " : " byName " , " options " : " VPD (kPa) " } ,
2026-04-03 22:16:02 -03:00
" properties " : [
2026-04-19 14:18:41 -03:00
{ " id " : " unit " , " value " : " none " } ,
2026-04-03 22:16:02 -03:00
{ " id " : " custom.axisPlacement " , " value " : " right " } ,
2026-04-19 14:18:41 -03:00
{ " id " : " custom.axisLabel " , " value " : " kPa " } ,
2026-04-03 22:16:02 -03:00
{ " id " : " decimals " , " value " : 2 } ,
2026-04-13 00:17:29 -03:00
] ,
2026-04-19 14:18:41 -03:00
}
2026-04-03 22:16:02 -03:00
] ,
2026-04-12 18:35:15 -03:00
legend_display = " list " ,
legend_placement = " bottom " ,
2026-04-03 14:55:16 -03:00
links = link_to ( " atlas-power " ) ,
2026-04-19 14:18:41 -03:00
description = " Two-axis chart: tent temperature/humidity/dew point (left axis) and VPD in kPa (right axis). " ,
2026-04-03 22:16:02 -03:00
)
2026-04-19 14:18:41 -03:00
)
2026-04-03 22:16:02 -03:00
panels . append (
2026-04-12 19:56:12 -03:00
stat_panel (
2026-04-03 22:16:02 -03:00
140 ,
" Fan Activity " ,
2026-04-19 14:18:41 -03:00
None ,
{ " h " : 6 , " w " : 4 , " x " : 16 , " y " : 12 } ,
2026-04-12 19:56:12 -03:00
unit = " none " ,
decimals = 0 ,
text_mode = " name_and_value " ,
2026-04-19 14:18:41 -03:00
targets = [
{ " refId " : " A " , " expr " : f " round( { CLIMATE_FAN_OUTLET_CURRENT } ) " , " legendFormat " : " Inside Outlet " , " instant " : True } ,
{ " refId " : " B " , " expr " : f " round( { CLIMATE_FAN_INSIDE_INLET_CURRENT } ) " , " legendFormat " : " Inside Inlet " , " instant " : True } ,
{ " refId " : " C " , " expr " : f " round( { CLIMATE_FAN_OUTSIDE_INLET_CURRENT } ) " , " legendFormat " : " Outside Inlet " , " instant " : True } ,
{ " refId " : " D " , " expr " : f " round( { CLIMATE_FAN_INTERIOR_CURRENT } ) " , " legendFormat " : " Interior Fans " , " instant " : True } ,
] ,
2026-04-12 19:56:12 -03:00
thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 7 } ,
{ " color " : " red " , " value " : 9 } ,
] ,
} ,
2026-04-19 14:18:41 -03:00
orientation = " horizontal " ,
wide_layout = True ,
2026-04-03 22:16:02 -03:00
links = link_to ( " atlas-power " ) ,
2026-04-03 14:55:16 -03:00
)
)
2026-04-03 22:16:02 -03:00
panels . append (
timeseries_panel (
141 ,
" Fan History (0-10) " ,
None ,
2026-04-19 14:18:41 -03:00
{ " h " : 6 , " w " : 4 , " x " : 20 , " y " : 12 } ,
2026-04-03 22:16:02 -03:00
unit = " none " ,
max_value = 10 ,
targets = [
2026-04-19 14:18:41 -03:00
{ " refId " : " A " , " expr " : CLIMATE_FAN_OUTLET_SERIES , " legendFormat " : " Inside Outlet " } ,
{ " refId " : " B " , " expr " : CLIMATE_FAN_INSIDE_INLET_SERIES , " legendFormat " : " Inside Inlet " } ,
{ " refId " : " C " , " expr " : CLIMATE_FAN_OUTSIDE_INLET_SERIES , " legendFormat " : " Outside Inlet " } ,
{ " refId " : " D " , " expr " : CLIMATE_FAN_INTERIOR_SERIES , " legendFormat " : " Interior Fans " } ,
2026-04-03 22:16:02 -03:00
] ,
2026-04-19 14:18:41 -03:00
legend_display = " list " ,
legend_placement = " bottom " ,
2026-04-03 22:16:02 -03:00
links = link_to ( " atlas-power " ) ,
)
2026-04-03 14:55:16 -03:00
)
panels . append (
2026-04-09 16:35:14 -03:00
bargauge_panel (
2026-04-03 14:55:16 -03:00
44 ,
2026-04-09 16:35:14 -03:00
" One-off Job Pods (age hours) " ,
ONEOFF_JOB_POD_AGE_HOURS ,
2026-04-19 14:18:41 -03:00
{ " h " : 5 , " w " : 6 , " x " : 0 , " y " : 7 } ,
2026-04-04 01:33:15 -03:00
unit = " h " ,
2026-04-03 14:55:16 -03:00
instant = True ,
2026-04-09 16:35:14 -03:00
legend = " {{ namespace}}/ {{ pod}} " ,
thresholds = age_thresholds ,
limit = 12 ,
decimals = 2 ,
links = link_to ( " atlas-jobs " ) ,
2026-04-03 14:55:16 -03:00
)
)
panels . append (
2026-04-09 16:35:14 -03:00
{
" id " : 45 ,
" type " : " timeseries " ,
" title " : " Ariadne Attempts / Failures " ,
" datasource " : PROM_DS ,
2026-04-19 14:18:41 -03:00
" gridPos " : { " h " : 5 , " w " : 6 , " x " : 6 , " y " : 7 } ,
2026-04-09 16:35:14 -03:00
" targets " : [
{ " expr " : ARIADNE_TASK_ATTEMPTS_SERIES , " refId " : " A " , " legendFormat " : " Attempts " } ,
{ " expr " : ARIADNE_TASK_FAILURES_SERIES , " refId " : " B " , " legendFormat " : " Failures " } ,
] ,
" fieldConfig " : {
" defaults " : { " unit " : " none " } ,
" overrides " : [
{
" matcher " : { " id " : " byName " , " options " : " Attempts " } ,
" properties " : [
{ " id " : " color " , " value " : { " mode " : " fixed " , " fixedColor " : " green " } }
] ,
} ,
{
" matcher " : { " id " : " byName " , " options " : " Failures " } ,
" properties " : [
{ " id " : " color " , " value " : { " mode " : " fixed " , " fixedColor " : " red " } }
] ,
} ,
] ,
} ,
" options " : {
" legend " : { " displayMode " : " table " , " placement " : " right " } ,
" tooltip " : { " mode " : " multi " } ,
} ,
" links " : link_to ( " atlas-jobs " ) ,
}
2026-04-03 14:55:16 -03:00
)
2026-04-08 23:33:17 -03:00
test_success = timeseries_panel (
2026-04-03 14:55:16 -03:00
46 ,
" Platform Test Success Rate " ,
2026-04-08 23:33:17 -03:00
None ,
2026-04-19 14:18:41 -03:00
{ " h " : 5 , " w " : 6 , " x " : 12 , " y " : 7 } ,
2026-04-03 14:55:16 -03:00
unit = " percent " ,
2026-04-09 15:21:59 -03:00
targets = PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS ,
2026-04-09 13:39:55 -03:00
legend_display = " table " ,
legend_placement = " right " ,
2026-04-09 19:27:48 -03:00
legend_calcs = [ " lastNotNull " ] ,
2026-04-19 14:18:41 -03:00
links = link_to ( " atlas-jobs " ) ,
2026-04-03 14:55:16 -03:00
)
2026-04-09 14:56:43 -03:00
test_success [ " fieldConfig " ] [ " defaults " ] [ " min " ] = 0
test_success [ " fieldConfig " ] [ " defaults " ] [ " max " ] = 100
2026-04-09 16:35:14 -03:00
test_success [ " fieldConfig " ] [ " defaults " ] [ " custom " ] = {
" drawStyle " : " line " ,
" lineInterpolation " : " linear " ,
" lineWidth " : 2 ,
" fillOpacity " : 10 ,
" showPoints " : " always " ,
" pointSize " : 4 ,
" spanNulls " : True ,
}
2026-04-09 20:05:10 -03:00
test_success [ " timeFrom " ] = " 7d "
2026-04-03 14:55:16 -03:00
test_success [ " description " ] = (
2026-04-09 20:05:10 -03:00
" Per-run interval pass points (0-100) for each software suite over the last 7 days. Points are connected to show trend; missing-run intervals are ignored. "
2026-04-03 14:55:16 -03:00
)
panels . append ( test_success )
2026-04-09 19:27:48 -03:00
panels . append (
2026-04-09 20:16:44 -03:00
bargauge_panel (
2026-04-09 19:27:48 -03:00
47 ,
2026-04-11 11:54:43 -03:00
" PVC Backup Health / Age " ,
PVC_BACKUP_AGE_HOURS_BY_PVC ,
2026-04-19 14:18:41 -03:00
{ " h " : 5 , " w " : 6 , " x " : 18 , " y " : 7 } ,
2026-04-11 11:54:43 -03:00
unit = " h " ,
2026-04-09 19:27:48 -03:00
instant = True ,
2026-04-11 11:54:43 -03:00
legend = " {{ namespace}}/ {{ pvc}} " ,
2026-04-09 20:16:44 -03:00
sort_order = " desc " ,
thresholds = {
" mode " : " absolute " ,
" steps " : [
2026-04-11 11:54:43 -03:00
{ " color " : " green " , " value " : None } ,
2026-04-14 02:14:43 -03:00
{ " color " : " yellow " , " value " : 20 } ,
{ " color " : " orange " , " value " : 40 } ,
{ " color " : " red " , " value " : 50 } ,
2026-04-09 20:16:44 -03:00
] ,
} ,
2026-04-09 19:27:48 -03:00
)
2026-04-03 14:55:16 -03:00
)
2026-04-11 11:54:43 -03:00
panels [ - 1 ] [ " links " ] = link_to ( " atlas-storage " )
2026-04-09 20:16:44 -03:00
panels [ - 1 ] [ " description " ] = (
2026-04-19 14:18:41 -03:00
" Oldest successful backup age in hours by PVC with nightly cadence thresholds (green <=20h, yellow <40h, orange <50h, red >=50h). PVCs with missing or unhealthy backup state are forced to 999h so critical bars stay visible. "
2026-04-09 20:16:44 -03:00
)
2026-04-03 14:55:16 -03:00
2026-01-06 02:34:52 -03:00
panels . append (
stat_panel (
2026-01-05 21:55:59 -03:00
30 ,
2026-01-06 02:34:52 -03:00
" Mail Sent (1d) " ,
' max(postmark_outbound_sent { window= " 1d " }) ' ,
2026-04-19 14:18:41 -03:00
{ " h " : 2 , " w " : 4 , " x " : 0 , " y " : 18 } ,
2026-01-06 02:34:52 -03:00
unit = " none " ,
links = link_to ( " atlas-mail " ) ,
)
)
panels . append (
{
" id " : 31 ,
" type " : " stat " ,
" title " : " Mail Bounces (1d) " ,
" datasource " : PROM_DS ,
2026-04-19 14:18:41 -03:00
" gridPos " : { " h " : 2 , " w " : 4 , " x " : 8 , " y " : 18 } ,
2026-01-06 02:34:52 -03:00
" targets " : [
{
" expr " : ' max(postmark_outbound_bounce_rate { window= " 1d " }) ' ,
" refId " : " A " ,
" legendFormat " : " Rate " ,
} ,
{
" expr " : ' max(postmark_outbound_bounced { window= " 1d " }) ' ,
" refId " : " B " ,
" legendFormat " : " Count " ,
} ,
] ,
" fieldConfig " : {
" defaults " : {
" color " : { " mode " : " thresholds " } ,
" custom " : { " displayMode " : " auto " } ,
" thresholds " : mail_bounce_rate_thresholds ,
" unit " : " none " ,
} ,
" overrides " : [
{
" matcher " : { " id " : " byName " , " options " : " Rate " } ,
" properties " : [ { " id " : " unit " , " value " : " percent " } ] ,
} ,
{
" matcher " : { " id " : " byName " , " options " : " Count " } ,
" properties " : [ { " id " : " unit " , " value " : " none " } ] ,
} ,
] ,
} ,
" options " : {
" colorMode " : " value " ,
" graphMode " : " area " ,
" justifyMode " : " center " ,
" reduceOptions " : { " calcs " : [ " lastNotNull " ] , " fields " : " " , " values " : False } ,
" textMode " : " name_and_value " ,
} ,
" links " : link_to ( " atlas-mail " ) ,
}
)
panels . append (
stat_panel (
2026-01-05 21:55:59 -03:00
32 ,
2026-01-06 02:34:52 -03:00
" Mail Success Rate (1d) " ,
' clamp_min(100 - max(postmark_outbound_bounce_rate { window= " 1d " }), 0) ' ,
2026-04-19 14:18:41 -03:00
{ " h " : 2 , " w " : 4 , " x " : 4 , " y " : 18 } ,
2026-01-06 02:34:52 -03:00
unit = " percent " ,
thresholds = mail_success_thresholds ,
decimals = 1 ,
links = link_to ( " atlas-mail " ) ,
)
)
panels . append (
stat_panel (
33 ,
2026-01-06 02:06:20 -03:00
" Mail Limit Used (30d) " ,
" max(postmark_sending_limit_used_percent) " ,
2026-04-19 14:18:41 -03:00
{ " h " : 2 , " w " : 4 , " x " : 12 , " y " : 18 } ,
2026-01-06 02:34:52 -03:00
unit = " percent " ,
thresholds = mail_limit_thresholds ,
decimals = 1 ,
links = link_to ( " atlas-mail " ) ,
2026-01-05 21:55:59 -03:00
)
2026-01-06 02:34:52 -03:00
)
2026-01-22 15:23:23 -03:00
panels . append (
2026-01-22 18:23:17 -03:00
stat_panel (
2026-01-22 15:23:23 -03:00
34 ,
" Postgres Connections Used " ,
2026-01-22 18:23:17 -03:00
POSTGRES_CONN_USED ,
2026-04-19 14:18:41 -03:00
{ " h " : 2 , " w " : 4 , " x " : 16 , " y " : 18 } ,
2026-01-22 18:23:17 -03:00
decimals = 0 ,
text_mode = " name_and_value " ,
legend = " {{ conn}} " ,
instant = True ,
2026-01-22 15:23:23 -03:00
)
)
panels . append (
stat_panel (
35 ,
" Postgres Hottest Connections " ,
POSTGRES_CONN_HOTTEST ,
2026-04-19 14:18:41 -03:00
{ " h " : 2 , " w " : 4 , " x " : 20 , " y " : 18 } ,
2026-01-22 15:23:23 -03:00
unit = " none " ,
decimals = 0 ,
text_mode = " name_and_value " ,
legend = " {{ datname}} " ,
instant = True ,
)
)
2026-01-05 21:55:59 -03:00
2026-01-01 14:44:33 -03:00
cpu_scope = " $namespace_scope_cpu "
gpu_scope = " $namespace_scope_gpu "
ram_scope = " $namespace_scope_ram "
2025-11-17 14:22:46 -03:00
panels . append (
2025-11-17 16:27:38 -03:00
pie_panel (
11 ,
2025-12-02 14:41:39 -03:00
" Namespace CPU Share " ,
2026-01-01 14:44:33 -03:00
namespace_cpu_share_expr ( cpu_scope ) ,
2026-04-04 01:33:15 -03:00
{ " h " : 9 , " w " : 8 , " x " : 0 , " y " : 23 } ,
2026-01-01 14:44:33 -03:00
links = namespace_scope_links ( " namespace_scope_cpu " ) ,
2026-01-18 02:50:07 -03:00
description = " Shares are normalized within the selected filter. Switching scope changes the denominator. " ,
2025-11-17 23:12:16 -03:00
)
)
panels . append (
pie_panel (
2025-11-17 23:42:55 -03:00
12 ,
2025-12-02 14:41:39 -03:00
" Namespace GPU Share " ,
2026-01-01 14:44:33 -03:00
namespace_gpu_share_expr ( gpu_scope ) ,
2026-04-04 01:33:15 -03:00
{ " h " : 9 , " w " : 8 , " x " : 8 , " y " : 23 } ,
2026-01-01 14:44:33 -03:00
links = namespace_scope_links ( " namespace_scope_gpu " ) ,
2026-01-18 02:50:07 -03:00
description = " Shares are normalized within the selected filter. Switching scope changes the denominator. " ,
2025-11-18 00:11:39 -03:00
)
)
panels . append (
pie_panel (
13 ,
2025-12-02 14:41:39 -03:00
" Namespace RAM Share " ,
2026-01-01 14:44:33 -03:00
namespace_ram_share_expr ( ram_scope ) ,
2026-04-04 01:33:15 -03:00
{ " h " : 9 , " w " : 8 , " x " : 16 , " y " : 23 } ,
2026-01-01 14:44:33 -03:00
links = namespace_scope_links ( " namespace_scope_ram " ) ,
2026-01-18 02:50:07 -03:00
description = " Shares are normalized within the selected filter. Switching scope changes the denominator. " ,
2025-11-17 14:22:46 -03:00
)
)
2025-11-17 21:48:12 -03:00
worker_filter = f " { WORKER_REGEX } "
2025-11-17 14:22:46 -03:00
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
14 ,
2025-12-02 14:41:39 -03:00
" Worker Node CPU " ,
2025-11-17 21:48:12 -03:00
node_cpu_expr ( worker_filter ) ,
2026-04-19 14:18:41 -03:00
{ " h " : 12 , " w " : 12 , " x " : 0 , " y " : 39 } ,
2025-11-17 14:22:46 -03:00
unit = " percent " ,
legend = " {{ node}} " ,
legend_calcs = [ " last " ] ,
legend_display = " table " ,
legend_placement = " right " ,
2025-11-17 16:27:38 -03:00
links = link_to ( " atlas-nodes " ) ,
2025-11-17 14:22:46 -03:00
)
)
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
15 ,
2025-12-02 14:41:39 -03:00
" Worker Node RAM " ,
2025-11-17 21:48:12 -03:00
node_mem_expr ( worker_filter ) ,
2026-04-19 14:18:41 -03:00
{ " h " : 12 , " w " : 12 , " x " : 12 , " y " : 39 } ,
2025-11-17 14:22:46 -03:00
unit = " percent " ,
legend = " {{ node}} " ,
legend_calcs = [ " last " ] ,
legend_display = " table " ,
legend_placement = " right " ,
2025-11-17 16:27:38 -03:00
links = link_to ( " atlas-nodes " ) ,
2025-11-17 14:22:46 -03:00
)
)
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
16 ,
2025-11-17 21:48:12 -03:00
" Control plane CPU " ,
2025-12-12 21:55:53 -03:00
node_cpu_expr ( CONTROL_ALL_REGEX ) ,
2026-04-19 14:18:41 -03:00
{ " h " : 10 , " w " : 12 , " x " : 0 , " y " : 51 } ,
2025-11-17 14:22:46 -03:00
unit = " percent " ,
legend = " {{ node}} " ,
2025-11-17 16:27:38 -03:00
legend_display = " table " ,
legend_placement = " right " ,
2025-11-17 14:22:46 -03:00
)
)
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
17 ,
2025-11-17 21:48:12 -03:00
" Control plane RAM " ,
2025-12-12 21:55:53 -03:00
node_mem_expr ( CONTROL_ALL_REGEX ) ,
2026-04-19 14:18:41 -03:00
{ " h " : 10 , " w " : 12 , " x " : 12 , " y " : 51 } ,
2025-11-17 14:22:46 -03:00
unit = " percent " ,
legend = " {{ node}} " ,
2025-11-17 16:27:38 -03:00
legend_display = " table " ,
legend_placement = " right " ,
2025-11-17 14:22:46 -03:00
)
)
2025-12-12 18:51:43 -03:00
panels . append (
pie_panel (
28 ,
2025-12-12 20:30:00 -03:00
" Node Pod Share " ,
2025-12-12 20:40:32 -03:00
' (sum(kube_pod_info { pod!= " " , node!= " " }) by (node) / clamp_min(sum(kube_pod_info { pod!= " " , node!= " " }), 1)) * 100 ' ,
2026-04-19 14:18:41 -03:00
{ " h " : 10 , " w " : 12 , " x " : 0 , " y " : 61 } ,
2025-12-12 18:51:43 -03:00
)
)
panels . append (
bargauge_panel (
29 ,
" Top Nodes by Pod Count " ,
2025-12-12 19:09:51 -03:00
' topk(12, sum(kube_pod_info { pod!= " " , node!= " " }) by (node)) ' ,
2026-04-19 14:18:41 -03:00
{ " h " : 10 , " w " : 12 , " x " : 12 , " y " : 61 } ,
2025-12-12 18:51:43 -03:00
unit = " none " ,
2025-12-12 18:56:13 -03:00
limit = 12 ,
2025-12-12 20:20:13 -03:00
decimals = 0 ,
thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 50 } ,
{ " color " : " orange " , " value " : 75 } ,
{ " color " : " red " , " value " : 100 } ,
] ,
} ,
2025-12-12 20:30:00 -03:00
instant = True ,
2025-12-12 18:51:43 -03:00
)
)
2025-11-17 14:22:46 -03:00
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
18 ,
2025-12-02 14:41:39 -03:00
" Cluster Ingress Throughput " ,
2025-11-17 16:27:38 -03:00
NET_INGRESS_EXPR ,
2026-04-19 14:18:41 -03:00
{ " h " : 7 , " w " : 8 , " x " : 0 , " y " : 32 } ,
2025-11-17 18:55:11 -03:00
unit = " Bps " ,
2025-11-18 14:08:33 -03:00
legend = " Ingress (Traefik) " ,
2025-11-17 16:27:38 -03:00
legend_display = " list " ,
legend_placement = " bottom " ,
links = link_to ( " atlas-network " ) ,
)
)
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
19 ,
2025-12-02 14:41:39 -03:00
" Cluster Egress Throughput " ,
2025-11-17 16:27:38 -03:00
NET_EGRESS_EXPR ,
2026-04-19 14:18:41 -03:00
{ " h " : 7 , " w " : 8 , " x " : 8 , " y " : 32 } ,
2025-11-17 18:55:11 -03:00
unit = " Bps " ,
2025-11-18 14:08:33 -03:00
legend = " Egress (Traefik) " ,
2025-11-17 16:27:38 -03:00
legend_display = " list " ,
legend_placement = " bottom " ,
links = link_to ( " atlas-network " ) ,
)
)
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
20 ,
2025-12-02 14:41:39 -03:00
" Intra-Cluster Throughput " ,
2025-11-18 14:08:33 -03:00
NET_INTERNAL_EXPR ,
2026-04-19 14:18:41 -03:00
{ " h " : 7 , " w " : 8 , " x " : 16 , " y " : 32 } ,
2025-11-18 14:08:33 -03:00
unit = " Bps " ,
legend = " Internal traffic " ,
legend_display = " list " ,
legend_placement = " bottom " ,
links = link_to ( " atlas-network " ) ,
)
)
panels . append (
timeseries_panel (
21 ,
2025-12-02 14:41:39 -03:00
" Root Filesystem Usage " ,
2025-11-17 14:22:46 -03:00
root_usage_expr ( ) ,
2026-04-19 14:18:41 -03:00
{ " h " : 16 , " w " : 12 , " x " : 0 , " y " : 71 } ,
2025-11-17 14:22:46 -03:00
unit = " percent " ,
legend = " {{ node}} " ,
legend_calcs = [ " last " ] ,
legend_display = " table " ,
legend_placement = " right " ,
2025-11-17 16:27:38 -03:00
time_from = " 30d " ,
links = link_to ( " atlas-storage " ) ,
2025-11-17 14:22:46 -03:00
)
)
panels . append (
2026-04-12 04:26:52 -03:00
timeseries_panel (
2025-12-02 13:16:00 -03:00
22 ,
2026-04-11 11:54:43 -03:00
" Nodes Closest to Full Astraios Disks " ,
2026-04-12 04:26:52 -03:00
astraios_usage_expr ( ) ,
2026-04-19 14:18:41 -03:00
{ " h " : 16 , " w " : 12 , " x " : 12 , " y " : 71 } ,
2025-12-02 13:16:00 -03:00
unit = " percent " ,
2026-04-12 04:26:52 -03:00
legend = " {{ node}} " ,
legend_calcs = [ " last " ] ,
legend_display = " table " ,
legend_placement = " right " ,
time_from = " 1w " ,
2025-12-02 13:16:00 -03:00
links = link_to ( " atlas-storage " ) ,
)
2025-11-17 14:22:46 -03:00
)
return {
" uid " : " atlas-overview " ,
" title " : " Atlas Overview " ,
2025-11-17 16:27:38 -03:00
" folderUid " : PUBLIC_FOLDER ,
2025-11-17 14:22:46 -03:00
" editable " : False ,
2025-11-17 16:27:38 -03:00
" annotations " : { " list " : [ ] } ,
2025-11-17 14:22:46 -03:00
" panels " : panels ,
" schemaVersion " : 39 ,
" style " : " dark " ,
" tags " : [ " atlas " , " overview " ] ,
2026-01-01 14:44:33 -03:00
" templating " : {
" list " : [
namespace_scope_variable ( " namespace_scope_cpu " , " CPU namespace filter " ) ,
namespace_scope_variable ( " namespace_scope_gpu " , " GPU namespace filter " ) ,
namespace_scope_variable ( " namespace_scope_ram " , " RAM namespace filter " ) ,
]
} ,
2025-12-02 14:41:39 -03:00
" time " : { " from " : " now-1h " , " to " : " now " } ,
" refresh " : " 1m " ,
2026-04-19 14:18:41 -03:00
" links " : [
{
" title " : " Atlas Testing (Internal) " ,
" url " : " /d/atlas-jobs " ,
" targetBlank " : False ,
}
] ,
2025-11-17 14:22:46 -03:00
}
def build_pods_dashboard():
    """Assemble the dashboard definition whose uid is ``" atlas-pods "``.

    The dashboard holds ten panels: four stat tiles on the top row, three
    full-width tables, a pie chart, a bar gauge, and a final table that is
    fed by a PromQL expression composed from several fragments below.

    NOTE(review): string literals are reproduced exactly as they appear in
    this file, including their padding spaces; confirm whether that padding
    is intentional before normalizing anything.

    Returns:
        dict: the dashboard definition (uid, title, folder, panels, ...).
    """

    def red_from_one():
        # A fresh dict per panel: green base step, red from a value of 1.
        return {
            " mode ": " absolute ",
            " steps ": [
                {" color ": " green ", " value ": None},
                {" color ": " red ", " value ": 1},
            ],
        }

    def labels_to_fields():
        # A fresh transformation entry per table.
        return {" id ": " labelsToFields ", " options ": {}}

    # Top row: four stat tiles, each 6 columns wide, laid out left to right.
    stat_tiles = (
        (1, " Problem Pods ", PROBLEM_PODS_EXPR),
        (2, " CrashLoop / ImagePull ", CRASHLOOP_EXPR),
        (3, " Stuck Terminating (>10m) ", STUCK_TERMINATING_EXPR),
        (
            4,
            " Control Plane Workloads ",
            f' sum(kube_pod_info {{ node=~ " {CONTROL_REGEX} " ,namespace!~ " {CP_ALLOWED_NS} " }} ) ',
        ),
    )
    panels = [
        stat_panel(
            panel_id,
            title,
            expr,
            {" h ": 4, " w ": 6, " x ": 6 * column, " y ": 0},
            thresholds=red_from_one(),
        )
        for column, (panel_id, title, expr) in enumerate(stat_tiles)
    ]

    # Full-width detail tables stacked below the stat row.
    panels.append(
        table_panel(
            5,
            " Pods Not Running ",
            PROBLEM_TABLE_EXPR,
            {" h ": 10, " w ": 24, " x ": 0, " y ": 4},
            unit=" s ",
            transformations=[labels_to_fields()],
        )
    )
    panels.append(
        table_panel(
            6,
            " CrashLoop / ImagePull ",
            CRASHLOOP_TABLE_EXPR,
            {" h ": 10, " w ": 24, " x ": 0, " y ": 14},
            unit=" s ",
            transformations=[labels_to_fields()],
        )
    )
    panels.append(
        table_panel(
            7,
            " Terminating >10m ",
            STUCK_TABLE_EXPR,
            {" h ": 10, " w ": 24, " x ": 0, " y ": 24},
            unit=" s ",
            transformations=[
                labels_to_fields(),
                # Keep only rows whose value exceeds 600.
                {
                    " id ": " filterByValue ",
                    " options ": {" match ": " Value ", " operator ": " gt ", " value ": 600},
                },
            ],
        )
    )

    # Pod distribution across nodes: pie on the right, bar gauge on the left.
    panels.append(
        pie_panel(
            8,
            " Node Pod Share ",
            ' (sum(kube_pod_info { pod!= " " , node!= " " }) by (node) / clamp_min(sum(kube_pod_info { pod!= " " , node!= " " }), 1)) * 100 ',
            {" h ": 8, " w ": 12, " x ": 12, " y ": 34},
        )
    )
    panels.append(
        bargauge_panel(
            9,
            " Top Nodes by Pod Count ",
            ' topk(12, sum(kube_pod_info { pod!= " " , node!= " " }) by (node)) ',
            {" h ": 8, " w ": 12, " x ": 0, " y ": 34},
            unit=" none ",
            limit=12,
            decimals=0,
            thresholds={
                " mode ": " absolute ",
                " steps ": [
                    {" color ": " green ", " value ": None},
                    {" color ": " yellow ", " value ": 50},
                    {" color ": " orange ", " value ": 75},
                    {" color ": " red ", " value ": 100},
                ],
            },
            instant=True,
        )
    )

    # Share of each namespace's pods that sits on each node.
    namespace_share = (
        ' (sum by (namespace,node) (kube_pod_info { pod!= " " , node!= " " }) '
        ' / on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info { pod!= " " }), 1) * 100) '
    )
    # One term per known node: a distinct small offset (position * 1e-3)
    # that is added to the share to build the score below.
    node_offsets = [
        f" (sum by (node) (kube_node_info {{ node= \" {node_name} \" }} ) * 0 + {position * 1e-3} ) "
        for position, node_name in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
    ]
    offset_expr = " or ".join(node_offsets)
    scored_share = f" {namespace_share} + on(node) group_left() ( {offset_expr} ) "
    # 1 where a row's score equals the per-namespace maximum score, else 0.
    top_node_mask = (
        f" {scored_share} == bool on(namespace) group_left() "
        f" (max by (namespace) ( {scored_share} )) "
    )
    plurality_expr = (
        f" {namespace_share} * on(namespace,node) group_left() "
        f" ( {top_node_mask} ) "
    )
    panels.append(
        table_panel(
            10,
            " Namespace Plurality by Node v27 ",
            plurality_expr,
            {" h ": 8, " w ": 24, " x ": 0, " y ": 42},
            unit=" percent ",
            transformations=[
                labels_to_fields(),
                {" id ": " organize ", " options ": {" excludeByName ": {" Time ": True}}},
                {
                    " id ": " filterByValue ",
                    " options ": {" match ": " Value ", " operator ": " gt ", " value ": 0},
                },
                {
                    " id ": " sortBy ",
                    " options ": {" fields ": [" Value "], " order ": " desc "},
                },
                {
                    " id ": " groupBy ",
                    " options ": {
                        " fields ": {
                            " namespace ": {
                                " aggregations ": [
                                    {" field ": " Value ", " operation ": " max "},
                                    {" field ": " node ", " operation ": " first "},
                                ]
                            }
                        },
                        " rowBy ": [" namespace "],
                    },
                },
            ],
            instant=True,
            options={" showColumnFilters ": False},
            filterable=False,
            footer={" show ": False, " fields ": " ", " calcs ": []},
            format=" table ",
        )
    )

    dashboard = {
        " uid ": " atlas-pods ",
        " title ": " Atlas Pods ",
        " folderUid ": PRIVATE_FOLDER,
        " editable ": True,
        " panels ": panels,
        " time ": {" from ": " now-12h ", " to ": " now "},
        " annotations ": {" list ": []},
        " schemaVersion ": 39,
        " style ": " dark ",
        " tags ": [" atlas ", " pods "],
    }
    return dashboard
def build_nodes_dashboard():
    """Assemble the dashboard definition whose uid is ``" atlas-nodes "``.

    Layout, top to bottom: three readiness/workload stat tiles, three
    control-plane health stat tiles (API server 5xx rate, API server P99
    latency, etcd P99 latency), then time series for node CPU, node RAM,
    the control-plane subset of both, root filesystem usage and Astraios
    usage.

    Panel ids must be unique within a dashboard.  The "Astraios Usage"
    panel previously reused id 9, which already belongs to
    "API Server 5xx rate"; it now uses the free id 12.

    NOTE(review): string literals are reproduced exactly as they appear in
    this file, including their padding spaces; confirm whether that padding
    is intentional before normalizing anything.

    Returns:
        dict: the dashboard definition (uid, title, folder, panels, ...).
    """

    def escalating(yellow, orange, red):
        # Absolute thresholds: green base step, then the three given cut-offs.
        return {
            " mode ": " absolute ",
            " steps ": [
                {" color ": " green ", " value ": None},
                {" color ": " yellow ", " value ": yellow},
                {" color ": " orange ", " value ": orange},
                {" color ": " red ", " value ": red},
            ],
        }

    panels = []

    # Row 1 (y=0): node readiness and stray control-plane workloads.
    panels.append(
        stat_panel(
            1,
            " Worker Nodes Ready ",
            f' sum(kube_node_status_condition {{ condition= " Ready " ,status= " true " ,node=~ " {WORKER_REGEX} " }} ) ',
            {" h ": 4, " w ": 8, " x ": 0, " y ": 0},
            value_suffix=WORKER_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            2,
            " Control Plane Ready ",
            f' sum(kube_node_status_condition {{ condition= " Ready " ,status= " true " ,node=~ " {CONTROL_REGEX} " }} ) ',
            {" h ": 4, " w ": 8, " x ": 8, " y ": 0},
            value_suffix=CONTROL_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            3,
            " Control Plane Workloads ",
            f' sum(kube_pod_info {{ node=~ " {CONTROL_REGEX} " ,namespace!~ " {CP_ALLOWED_NS} " }} ) ',
            {" h ": 4, " w ": 8, " x ": 16, " y ": 0},
        )
    )

    # Row 2 (y=4): API server and etcd health.
    panels.append(
        stat_panel(
            9,
            " API Server 5xx rate ",
            APISERVER_5XX_RATE,
            {" h ": 4, " w ": 8, " x ": 0, " y ": 4},
            unit=" req/s ",
            thresholds=escalating(0.05, 0.2, 0.5),
            decimals=3,
        )
    )
    panels.append(
        stat_panel(
            10,
            " API Server P99 latency ",
            APISERVER_P99_LATENCY_MS,
            {" h ": 4, " w ": 8, " x ": 8, " y ": 4},
            unit=" ms ",
            thresholds=escalating(250, 400, 600),
            decimals=1,
        )
    )
    panels.append(
        stat_panel(
            11,
            " etcd P99 latency ",
            ETCD_P99_LATENCY_MS,
            {" h ": 4, " w ": 8, " x ": 16, " y ": 4},
            unit=" ms ",
            thresholds=escalating(50, 100, 200),
            decimals=1,
        )
    )

    # Full-width CPU / RAM history for every node.
    panels.append(
        timeseries_panel(
            4,
            " Node CPU ",
            node_cpu_expr(),
            {" h ": 9, " w ": 24, " x ": 0, " y ": 8},
            unit=" percent ",
            legend=" {{ node}} ",
            legend_calcs=[" last "],
            legend_display=" table ",
            legend_placement=" right ",
        )
    )
    panels.append(
        timeseries_panel(
            5,
            " Node RAM ",
            node_mem_expr(),
            {" h ": 9, " w ": 24, " x ": 0, " y ": 17},
            unit=" percent ",
            legend=" {{ node}} ",
            legend_calcs=[" last "],
            legend_display=" table ",
            legend_placement=" right ",
        )
    )

    # Same metrics restricted to CONTROL_ALL_REGEX, side by side.
    panels.append(
        timeseries_panel(
            6,
            " Control Plane (incl. titan-db) CPU ",
            node_cpu_expr(CONTROL_ALL_REGEX),
            {" h ": 9, " w ": 12, " x ": 0, " y ": 26},
            unit=" percent ",
            legend=" {{ node}} ",
            legend_display=" table ",
            legend_placement=" right ",
        )
    )
    panels.append(
        timeseries_panel(
            7,
            " Control Plane (incl. titan-db) RAM ",
            node_mem_expr(CONTROL_ALL_REGEX),
            {" h ": 9, " w ": 12, " x ": 12, " y ": 26},
            unit=" percent ",
            legend=" {{ node}} ",
            legend_display=" table ",
            legend_placement=" right ",
        )
    )

    # Disk usage history; both panels pass time_from=" 30d ".
    panels.append(
        timeseries_panel(
            8,
            " Root Filesystem Usage ",
            root_usage_expr(),
            {" h ": 9, " w ": 24, " x ": 0, " y ": 35},
            unit=" percent ",
            legend=" {{ node}} ",
            legend_display=" table ",
            legend_placement=" right ",
            time_from=" 30d ",
        )
    )
    panels.append(
        timeseries_panel(
            # Was 9, colliding with "API Server 5xx rate"; 12 is unused here.
            12,
            " Astraios Usage ",
            astraios_usage_expr(),
            {" h ": 9, " w ": 24, " x ": 0, " y ": 44},
            unit=" percent ",
            legend=" {{ node}} ",
            legend_display=" table ",
            legend_placement=" right ",
            time_from=" 30d ",
        )
    )

    return {
        " uid ": " atlas-nodes ",
        " title ": " Atlas Nodes ",
        " folderUid ": PRIVATE_FOLDER,
        " editable ": True,
        " panels ": panels,
        " time ": {" from ": " now-12h ", " to ": " now "},
        " annotations ": {" list ": []},
        " schemaVersion ": 39,
        " style ": " dark ",
        " tags ": [" atlas ", " nodes "],
    }
def build_storage_dashboard():
    """Assemble the dashboard definition whose uid is ``" atlas-storage "``.

    Panels come in pairs, one for the ``" /mnt/astreae "`` mount and one
    for ``" /mnt/asteria "``: usage and free-space stat tiles, per-node
    usage history, and overall usage history.  Two stat tiles for the
    ``maintenance`` namespace (sweeper readiness and cron freshness)
    follow.

    NOTE(review): string literals are reproduced exactly as they appear in
    this file, including their padding spaces; confirm whether that padding
    is intentional before normalizing anything.

    Returns:
        dict: the dashboard definition (uid, title, folder, panels, ...).
    """
    astreae_mount = " /mnt/astreae "
    asteria_mount = " /mnt/asteria "
    # Keyword arguments shared by the two per-node time series.
    node_legend = {
        "legend": " {{ node}} ",
        "legend_display": " table ",
        "legend_placement": " right ",
    }

    panels = [
        # Row 1: current usage and free space per volume.
        stat_panel(
            1,
            " Astreae Usage ",
            astreae_usage_expr(astreae_mount),
            {" h ": 5, " w ": 6, " x ": 0, " y ": 0},
            unit=" percent ",
            thresholds=PERCENT_THRESHOLDS,
        ),
        stat_panel(
            2,
            " Asteria Usage ",
            astreae_usage_expr(asteria_mount),
            {" h ": 5, " w ": 6, " x ": 6, " y ": 0},
            unit=" percent ",
            thresholds=PERCENT_THRESHOLDS,
        ),
        stat_panel(
            3,
            " Astreae Free ",
            astreae_free_expr(astreae_mount),
            {" h ": 5, " w ": 6, " x ": 12, " y ": 0},
            unit=" decbytes ",
        ),
        stat_panel(
            4,
            " Asteria Free ",
            astreae_free_expr(asteria_mount),
            {" h ": 5, " w ": 6, " x ": 18, " y ": 0},
            unit=" decbytes ",
        ),
        # Row 2: per-node usage, filtered by LONGHORN_NODE_REGEX.
        timeseries_panel(
            5,
            " Astreae Per-Node Usage ",
            filesystem_usage_expr(astreae_mount, LONGHORN_NODE_REGEX),
            {" h ": 9, " w ": 12, " x ": 0, " y ": 5},
            unit=" percent ",
            **node_legend,
            time_from=" 30d ",
        ),
        timeseries_panel(
            6,
            " Asteria Per-Node Usage ",
            filesystem_usage_expr(asteria_mount, LONGHORN_NODE_REGEX),
            {" h ": 9, " w ": 12, " x ": 12, " y ": 5},
            unit=" percent ",
            **node_legend,
            time_from=" 30d ",
        ),
        # Row 3: usage history per volume.
        timeseries_panel(
            7,
            " Astreae Usage History ",
            astreae_usage_expr(astreae_mount),
            {" h ": 9, " w ": 12, " x ": 0, " y ": 14},
            unit=" percent ",
            time_from=" 90d ",
        ),
        timeseries_panel(
            8,
            " Asteria Usage History ",
            astreae_usage_expr(asteria_mount),
            {" h ": 9, " w ": 12, " x ": 12, " y ": 14},
            unit=" percent ",
            time_from=" 90d ",
        ),
        # Maintenance tiles: ready / desired sweeper pods as a percentage,
        # and seconds since the image-sweeper cron job last succeeded.
        stat_panel(
            30,
            " Maintenance Sweepers Ready ",
            ' kube_daemonset_status_number_ready { namespace= " maintenance " ,daemonset= " node-image-sweeper " } / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled { namespace= " maintenance " ,daemonset= " node-image-sweeper " } * 100 ',
            {" h ": 4, " w ": 12, " x ": 0, " y ": 44},
            unit=" percent ",
            thresholds=PERCENT_THRESHOLDS,
        ),
        stat_panel(
            31,
            " Maintenance Cron Freshness (s) ",
            ' time() - max by (cronjob) (kube_cronjob_status_last_successful_time { namespace= " maintenance " ,cronjob= " image-sweeper " }) ',
            {" h ": 4, " w ": 12, " x ": 12, " y ": 44},
            unit=" s ",
            thresholds={
                " mode ": " absolute ",
                " steps ": [
                    {" color ": " green ", " value ": None},
                    {" color ": " yellow ", " value ": 3600},
                    {" color ": " red ", " value ": 10800},
                ],
            },
        ),
    ]

    dashboard = {
        " uid ": " atlas-storage ",
        " title ": " Atlas Storage ",
        " folderUid ": PRIVATE_FOLDER,
        " editable ": True,
        " panels ": panels,
        " time ": {" from ": " now-12h ", " to ": " now "},
        " annotations ": {" list ": []},
        " schemaVersion ": 39,
        " style ": " dark ",
        " tags ": [" atlas ", " storage "],
    }
    return dashboard
2025-11-17 16:27:38 -03:00
def build_network_dashboard():
    """Assemble the dashboard definition whose uid is ``" atlas-network "``.

    Layout, top to bottom: four stat tiles (success rate, two error-budget
    burn windows, P99 latency), three traffic stat tiles, a per-node
    throughput time series, two top-10 tables (namespaces and pods), and
    two Traefik request-rate time series.

    NOTE(review): string literals are reproduced exactly as they appear in
    this file, including their padding spaces; confirm whether that padding
    is intentional before normalizing anything.

    Returns:
        dict: the dashboard definition (uid, title, folder, panels, ...).
    """

    def absolute_steps(*colored_values):
        # Build an absolute-mode threshold dict from (color, value) pairs,
        # keeping the order in which the pairs are given.
        return {
            " mode ": " absolute ",
            " steps ": [
                {" color ": color, " value ": value}
                for color, value in colored_values
            ],
        }

    def burn_steps():
        # Shared scale for both burn tiles; a fresh dict on every call.
        return absolute_steps(
            (" green ", None),
            (" yellow ", 1),
            (" orange ", 2),
            (" red ", 4),
        )

    def labels_to_fields():
        return {" id ": " labelsToFields ", " options ": {}}

    panels = [
        # Row 1 (y=0): service-level tiles.
        stat_panel(
            1,
            " Ingress Success Rate (5m) ",
            TRAEFIK_SLI_5M,
            {" h ": 4, " w ": 6, " x ": 0, " y ": 0},
            unit=" percentunit ",
            decimals=2,
            # Inverted scale: red is the base step, green from 0.9995 up.
            thresholds=absolute_steps(
                (" red ", None),
                (" orange ", 0.995),
                (" yellow ", 0.999),
                (" green ", 0.9995),
            ),
        ),
        stat_panel(
            2,
            " Error Budget Burn (1h) ",
            traefik_burn(" 1h "),
            {" h ": 4, " w ": 6, " x ": 6, " y ": 0},
            thresholds=burn_steps(),
            decimals=2,
        ),
        stat_panel(
            3,
            " Error Budget Burn (6h) ",
            traefik_burn(" 6h "),
            {" h ": 4, " w ": 6, " x ": 12, " y ": 0},
            thresholds=burn_steps(),
            decimals=2,
        ),
        stat_panel(
            4,
            " Edge P99 Latency (ms) ",
            TRAEFIK_P99_LATENCY_MS,
            {" h ": 4, " w ": 6, " x ": 18, " y ": 0},
            unit=" ms ",
            thresholds=absolute_steps(
                (" green ", None),
                (" yellow ", 200),
                (" orange ", 350),
                (" red ", 500),
            ),
            decimals=1,
        ),
        # Row 2 (y=4): traffic volume tiles.
        stat_panel(
            5,
            " Ingress Traffic ",
            NET_INGRESS_EXPR,
            {" h ": 4, " w ": 8, " x ": 0, " y ": 4},
            unit=" Bps ",
        ),
        stat_panel(
            6,
            " Egress Traffic ",
            NET_EGRESS_EXPR,
            {" h ": 4, " w ": 8, " x ": 8, " y ": 4},
            unit=" Bps ",
        ),
        stat_panel(
            7,
            " Intra-Cluster Traffic ",
            NET_INTERNAL_EXPR,
            {" h ": 4, " w ": 8, " x ": 16, " y ": 4},
            unit=" Bps ",
        ),
        # Transmit plus receive, joined to NODE_INFO on instance to attach
        # the node label, then averaged per node.
        timeseries_panel(
            8,
            " Per-Node Throughput ",
            f' avg by (node) (( {NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS} ) * on(instance) group_left(node) {NODE_INFO} ) ',
            {" h ": 8, " w ": 24, " x ": 0, " y ": 8},
            unit=" Bps ",
            legend=" {{ node}} ",
            legend_display=" table ",
            legend_placement=" right ",
        ),
        # Top-10 talkers over a 5m rate window.
        table_panel(
            9,
            " Top Namespaces ",
            ' topk(10, sum(rate(container_network_transmit_bytes_total { namespace!= " " }[5m]) '
            ' + rate(container_network_receive_bytes_total { namespace!= " " }[5m])) by (namespace)) ',
            {" h ": 9, " w ": 12, " x ": 0, " y ": 16},
            unit=" Bps ",
            transformations=[labels_to_fields()],
        ),
        table_panel(
            10,
            " Top Pods ",
            ' topk(10, sum(rate(container_network_transmit_bytes_total { pod!= " " }[5m]) '
            ' + rate(container_network_receive_bytes_total { pod!= " " }[5m])) by (namespace,pod)) ',
            {" h ": 9, " w ": 12, " x ": 12, " y ": 16},
            unit=" Bps ",
            transformations=[labels_to_fields()],
        ),
        # Traefik request rates by router and by entrypoint.
        timeseries_panel(
            11,
            " Traefik Routers (req/s) ",
            f" topk(10, {TRAEFIK_ROUTER_EXPR} ) ",
            {" h ": 9, " w ": 12, " x ": 0, " y ": 25},
            unit=" req/s ",
            legend=" {{ router}} ",
            legend_display=" table ",
            legend_placement=" right ",
        ),
        timeseries_panel(
            12,
            " Traefik Entrypoints (req/s) ",
            ' sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m])) ',
            {" h ": 9, " w ": 12, " x ": 12, " y ": 25},
            unit=" req/s ",
            legend=" {{ entrypoint}} ",
            legend_display=" table ",
            legend_placement=" right ",
        ),
    ]

    dashboard = {
        " uid ": " atlas-network ",
        " title ": " Atlas Network ",
        " folderUid ": PRIVATE_FOLDER,
        " editable ": True,
        " panels ": panels,
        " time ": {" from ": " now-12h ", " to ": " now "},
        " annotations ": {" list ": []},
        " schemaVersion ": 39,
        " style ": " dark ",
        " tags ": [" atlas ", " network "],
    }
    return dashboard
2026-01-05 21:55:59 -03:00
def build_mail_dashboard():
    """Assemble the dashboard definition whose uid is ``" atlas-mail "``.

    Layout: two rows of four stat tiles built from ``postmark_*`` metrics
    (the bounce tile is a hand-written panel dict with two targets), then
    four time series comparing values per ``window`` label and tracking
    exporter errors.

    NOTE(review): string literals are reproduced exactly as they appear in
    this file, including their padding spaces; confirm whether that padding
    is intentional before normalizing anything.

    Returns:
        dict: the dashboard definition (uid, title, folder, panels, ...).
    """

    def absolute_steps(*colored_values):
        # Build an absolute-mode threshold dict from (color, value) pairs,
        # keeping the order in which the pairs are given.
        return {
            " mode ": " absolute ",
            " steps ": [
                {" color ": color, " value ": value}
                for color, value in colored_values
            ],
        }

    bounce_scale = absolute_steps(
        (" green ", None),
        (" yellow ", 5),
        (" orange ", 8),
        (" red ", 10),
    )
    limit_scale = absolute_steps(
        (" green ", None),
        (" yellow ", 70),
        (" orange ", 85),
        (" red ", 95),
    )
    # Inverted scale: red is the base step, green from 98 upward.
    success_scale = absolute_steps(
        (" red ", None),
        (" orange ", 90),
        (" yellow ", 95),
        (" green ", 98),
    )

    # Hand-written stat panel: target A carries the bounce rate, target B
    # the bounce count, and per-series overrides give each its own unit.
    bounce_tile = {
        " id ": 3,
        " type ": " stat ",
        " title ": " Mail Bounces (1d) ",
        " datasource ": PROM_DS,
        " gridPos ": {" h ": 4, " w ": 6, " x ": 12, " y ": 0},
        " targets ": [
            {
                " expr ": ' max(postmark_outbound_bounce_rate { window= " 1d " }) ',
                " refId ": " A ",
                " legendFormat ": " Rate ",
            },
            {
                " expr ": ' max(postmark_outbound_bounced { window= " 1d " }) ',
                " refId ": " B ",
                " legendFormat ": " Count ",
            },
        ],
        " fieldConfig ": {
            " defaults ": {
                " color ": {" mode ": " thresholds "},
                " custom ": {" displayMode ": " auto "},
                " thresholds ": bounce_scale,
                " unit ": " none ",
            },
            " overrides ": [
                {
                    " matcher ": {" id ": " byName ", " options ": " Rate "},
                    " properties ": [{" id ": " unit ", " value ": " percent "}],
                },
                {
                    " matcher ": {" id ": " byName ", " options ": " Count "},
                    " properties ": [{" id ": " unit ", " value ": " none "}],
                },
            ],
        },
        " options ": {
            " colorMode ": " value ",
            " graphMode ": " area ",
            " justifyMode ": " center ",
            " reduceOptions ": {" calcs ": [" lastNotNull "], " fields ": " ", " values ": False},
            " textMode ": " name_and_value ",
        },
    }

    panels = [
        # Row 1 (y=0): volume, bounces, success rate.
        stat_panel(
            1,
            " Sent (1d) ",
            ' max(postmark_outbound_sent { window= " 1d " }) ',
            {" h ": 4, " w ": 6, " x ": 0, " y ": 0},
            decimals=0,
        ),
        stat_panel(
            2,
            " Sent (7d) ",
            ' max(postmark_outbound_sent { window= " 7d " }) ',
            {" h ": 4, " w ": 6, " x ": 6, " y ": 0},
            decimals=0,
        ),
        bounce_tile,
        stat_panel(
            4,
            " Success Rate (1d) ",
            ' clamp_min(100 - max(postmark_outbound_bounce_rate { window= " 1d " }), 0) ',
            {" h ": 4, " w ": 6, " x ": 18, " y ": 0},
            unit=" percent ",
            thresholds=success_scale,
            decimals=1,
        ),
        # Row 2 (y=4): sending limit and exporter health.
        stat_panel(
            5,
            " Limit Used (30d) ",
            " max(postmark_sending_limit_used_percent) ",
            {" h ": 4, " w ": 6, " x ": 0, " y ": 4},
            thresholds=limit_scale,
            unit=" percent ",
            decimals=1,
        ),
        stat_panel(
            6,
            " Send Limit (30d) ",
            " max(postmark_sending_limit) ",
            {" h ": 4, " w ": 6, " x ": 6, " y ": 4},
            decimals=0,
        ),
        stat_panel(
            7,
            " Last Success ",
            " max(postmark_last_success_timestamp_seconds) ",
            {" h ": 4, " w ": 6, " x ": 12, " y ": 4},
            unit=" dateTimeAsIso ",
            decimals=0,
        ),
        stat_panel(
            8,
            " Exporter Errors ",
            " sum(postmark_request_errors_total) ",
            {" h ": 4, " w ": 6, " x ": 18, " y ": 4},
            decimals=0,
        ),
        # History panels, one series per `window` label value.
        timeseries_panel(
            13,
            " Bounce Rate (1d vs 7d) ",
            " max by (window) (postmark_outbound_bounce_rate) ",
            {" h ": 8, " w ": 12, " x ": 0, " y ": 12},
            unit=" percent ",
            legend=" {{ window}} ",
            legend_display=" table ",
            legend_placement=" right ",
        ),
        timeseries_panel(
            14,
            " Bounced (1d vs 7d) ",
            " max by (window) (postmark_outbound_bounced) ",
            {" h ": 8, " w ": 12, " x ": 12, " y ": 12},
            unit=" none ",
            legend=" {{ window}} ",
            legend_display=" table ",
            legend_placement=" right ",
        ),
        timeseries_panel(
            15,
            " Sent (1d vs 7d) ",
            " max by (window) (postmark_outbound_sent) ",
            {" h ": 8, " w ": 12, " x ": 0, " y ": 20},
            unit=" none ",
            legend=" {{ window}} ",
            legend_display=" table ",
            legend_placement=" right ",
        ),
        timeseries_panel(
            16,
            " Exporter Errors ",
            " sum(postmark_request_errors_total) ",
            {" h ": 8, " w ": 12, " x ": 12, " y ": 20},
            unit=" none ",
        ),
    ]

    dashboard = {
        " uid ": " atlas-mail ",
        " title ": " Atlas Mail ",
        " folderUid ": PRIVATE_FOLDER,
        " editable ": True,
        " panels ": panels,
        " time ": {" from ": " now-30d ", " to ": " now "},
        " annotations ": {" list ": []},
        " schemaVersion ": 39,
        " style ": " dark ",
        " tags ": [" atlas ", " mail "],
    }
    return dashboard
2026-04-19 14:18:41 -03:00
def build_jobs_dashboard ( ) :
2026-04-12 20:05:39 -03:00
panels = [ ]
2026-04-19 23:22:34 -03:00
suite_var = " $ { suite:regex} "
2026-04-20 08:35:05 -03:00
test_var = " $ { test:regex} "
2026-04-21 09:35:43 -03:00
branch_var = " $ { branch:regex} "
2026-04-18 17:47:06 -03:00
success = PLATFORM_TEST_SUCCESS_STATUS
2026-04-19 14:18:41 -03:00
exported = PLATFORM_TEST_EXPORT_FILTER
runs_selector = f ' suite=~ " { suite_var } " , { exported } '
runs_success_selector = f ' { runs_selector } ,status=~ " { success } " '
runs_failure_selector = f ' { runs_selector } ,status!~ " { success } " '
checks_selector = f ' __name__=~ " .*_quality_gate_checks_total " ,suite=~ " { suite_var } " , { exported } '
tests_selector = f ' __name__=~ " .*_quality_gate_tests_total " ,suite=~ " { suite_var } " , { exported } '
coverage_metric_selector = f ' __name__=~ " .*_quality_gate_coverage_percent " ,suite=~ " { suite_var } " , { exported } '
workspace_coverage_selector = f ' suite=~ " { suite_var } " , { exported } '
smell_selector = f ' suite=~ " { suite_var } " , { exported } '
2026-04-20 09:13:34 -03:00
test_case_selector = f ' suite=~ " { suite_var } " ,test=~ " { test_var } " ,test!= " __no_test_cases__ " , { exported } '
2026-04-21 09:35:43 -03:00
build_info_selector = f ' suite=~ " { suite_var } " ,branch=~ " { branch_var } " , { exported } '
2026-04-19 14:18:41 -03:00
suite_universe = " or " . join (
f ' label_replace(vector(1), " suite " , " { suite } " , " __name__ " , " .* " ) '
for suite in PLATFORM_TEST_SUITE_NAMES
2026-04-18 17:47:06 -03:00
)
2026-04-19 14:18:41 -03:00
runs_24h = f ' (sum(increase(platform_quality_gate_runs_total {{ { runs_selector } }} [24h])) or on() vector(0)) '
runs_30d = f ' (sum(increase(platform_quality_gate_runs_total {{ { runs_selector } }} [30d])) or on() vector(0)) '
2026-04-18 17:47:06 -03:00
success_24h = (
2026-04-19 14:18:41 -03:00
f ' (sum(increase(platform_quality_gate_runs_total {{ { runs_success_selector } }} [24h])) or on() vector(0)) '
2026-04-18 17:47:06 -03:00
)
success_30d = (
2026-04-19 14:18:41 -03:00
f ' (sum(increase(platform_quality_gate_runs_total {{ { runs_success_selector } }} [30d])) or on() vector(0)) '
2026-04-18 17:47:06 -03:00
)
failures_24h = (
2026-04-19 14:18:41 -03:00
f ' (sum(increase(platform_quality_gate_runs_total {{ { runs_failure_selector } }} [24h])) or on() vector(0)) '
2026-04-18 17:47:06 -03:00
)
success_rate_24h = f " 100 * ( { success_24h } ) / clamp_min(( { runs_24h } ), 1) "
success_rate_30d = f " 100 * ( { success_30d } ) / clamp_min(( { runs_30d } ), 1) "
2026-04-19 14:18:41 -03:00
success_rate_by_suite_24h = (
f ' sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total {{ { runs_success_selector } }} [24h]))) '
f ' / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total {{ { runs_selector } }} [24h]))), 1)) '
)
failures_by_suite_24h = (
f ' sum by (suite) (increase(platform_quality_gate_runs_total {{ { runs_failure_selector } }} [24h])) '
)
2026-04-20 08:35:05 -03:00
success_history_by_suite = (
2026-04-20 08:07:30 -03:00
f ' 100 * (sum by (suite) (increase(platform_quality_gate_runs_total {{ { runs_success_selector } }} [$__interval])) '
f ' / clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total {{ { runs_selector } }} [$__interval]))), 1)) '
)
2026-04-18 17:47:06 -03:00
coverage_by_suite = (
2026-04-19 14:18:41 -03:00
f ' (max by (suite) ( {{ { coverage_metric_selector } }} )) '
f ' or on(suite) (max by (suite) (platform_quality_gate_workspace_line_coverage_percent {{ { workspace_coverage_selector } }} )) '
2026-04-18 17:47:06 -03:00
)
2026-04-20 13:45:01 -03:00
coverage_with_missing = (
f " ( { coverage_by_suite } ) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total {{ { runs_selector } }} [30d])))) - 1) "
)
2026-04-18 17:47:06 -03:00
coverage_gap = f " clamp_min(95 - ( { coverage_by_suite } ), 0) "
2026-04-19 14:18:41 -03:00
smell_by_suite = f ' max by (suite) (platform_quality_gate_source_lines_over_500_total {{ { smell_selector } }} ) '
2026-04-20 13:45:01 -03:00
smell_with_missing = (
f " ( { smell_by_suite } ) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total {{ { runs_selector } }} [30d])))) - 1) "
)
2026-04-18 17:47:06 -03:00
average_coverage = f " (avg(( { coverage_by_suite } )) or on() vector(0)) "
suites_loc_violating = f ' (sum((( { smell_by_suite } ) > bool 0)) or on() vector(0)) '
2026-04-19 14:18:41 -03:00
checks_failed_total = f ' (sum( {{ { checks_selector } ,result!~ " { success } " }} ) or on() vector(0)) '
checks_failed_tests = (
f ' (sum(count by (suite) ( {{ { checks_selector } ,check=~ " tests|unit|build " ,result!~ " { success } " }} )) or on() vector(0)) '
2026-04-18 17:47:06 -03:00
)
2026-04-19 14:18:41 -03:00
checks_failed_coverage = (
f ' (sum(count by (suite) ( {{ { checks_selector } ,check=~ " coverage " ,result!~ " { success } " }} )) or on() vector(0)) '
)
checks_failed_loc = (
f ' (sum(count by (suite) ( {{ { checks_selector } ,check=~ " loc|smell " ,result!~ " { success } " }} )) or on() vector(0)) '
)
checks_failed_docs = (
f ' (sum(count by (suite) ( {{ { checks_selector } ,check=~ " docs|naming|hygiene|lint|docs_naming " ,result!~ " { success } " }} )) or on() vector(0)) '
)
checks_failed_gate = (
f ' (sum(count by (suite) ( {{ { checks_selector } ,check=~ " gate|glue|gate_glue " ,result!~ " { success } " }} )) or on() vector(0)) '
)
checks_failed_sonarqube = (
f ' (sum(count by (suite) ( {{ { checks_selector } ,check=~ " sonarqube|sonar " ,result!~ " { success } " }} )) or on() vector(0)) '
)
checks_failed_supply_chain = (
f ' (sum(count by (suite) ( {{ { checks_selector } ,check=~ " ironbank|supply_chain|image_compliance|artifact_security " ,result!~ " { success } " }} )) or on() vector(0)) '
)
2026-04-20 08:07:30 -03:00
check_regex_tests = " tests|unit|build "
check_regex_coverage = " coverage "
check_regex_loc = " loc|smell "
check_regex_style = " docs|naming|hygiene|lint|docs_naming|style "
check_regex_gate_glue = " gate|glue|gate_glue "
check_regex_sonarqube = " sonarqube|sonar "
check_regex_supply_chain = " ironbank|supply_chain|image_compliance|artifact_security "
def _check_state_series(regex: str, failed: bool) -> str:
    """Return the per-suite trend query for one family of checks.

    Args:
        regex: Pattern matched against the ``check`` label.
        failed: When true, select series whose ``result`` does NOT match the
            enclosing ``success`` pattern; otherwise select those that do.

    Returns:
        A query string: the per-suite sum of ``max_over_time`` over
        ``$__interval`` for the selected series, OR-ed (on ``suite``) with
        ``0 * suite_universe`` so suites present in that expression but
        absent from the main result still show up with a zero value.
    """
    # Pick the result matcher first; everything else is shared by both modes.
    if failed:
        result_matcher = f' result!~ " { success } " '
    else:
        result_matcher = f' result=~ " { success } " '
    # Label selector combining the shared scope, the check family and the state.
    selector = f'{{ { checks_selector } ,check=~ " { regex } " , { result_matcher } }}'
    peak_by_suite = f' sum by (suite) (max_over_time(( { selector } )[$__interval])) '
    return f' ( { peak_by_suite } ) or on(suite) (0 * ( { suite_universe } )) '
2026-04-19 14:18:41 -03:00
2026-04-20 08:37:26 -03:00
problematic_tests_history_core = (
2026-04-20 09:13:34 -03:00
f ' topk(12, sum by (suite, test) (increase(platform_quality_gate_test_case_result {{ suite=~ " { suite_var } " ,test!= " __no_test_cases__ " ,status= " failed " , { exported } }} [$__interval]))) '
2026-04-20 08:35:05 -03:00
)
2026-04-20 08:37:26 -03:00
problematic_tests_history = f " ( { problematic_tests_history_core } ) or on() vector(0) "
worst_test_per_suite_core = (
2026-04-20 09:13:34 -03:00
f ' topk by (suite) (1, sum by (suite, test) (increase(platform_quality_gate_test_case_result {{ suite=~ " { suite_var } " ,test!= " __no_test_cases__ " ,status= " failed " , { exported } }} [30d]))) '
2026-04-20 08:35:05 -03:00
)
2026-04-20 08:37:26 -03:00
worst_test_per_suite = f " ( { worst_test_per_suite_core } ) or on() vector(0) "
2026-04-20 08:35:05 -03:00
selected_test_pass_fail = [
{
" refId " : " A " ,
" expr " : f ' sum by (suite) (increase(platform_quality_gate_test_case_result {{ { test_case_selector } ,status= " passed " }} [$__interval])) or on() vector(0) ' ,
" legendFormat " : " passed · {{ suite}} " ,
} ,
{
" refId " : " B " ,
" expr " : f ' sum by (suite) (increase(platform_quality_gate_test_case_result {{ { test_case_selector } ,status= " failed " }} [$__interval])) or on() vector(0) ' ,
" legendFormat " : " failed · {{ suite}} " ,
} ,
{
" refId " : " C " ,
" expr " : f ' sum by (suite) (increase(platform_quality_gate_test_case_result {{ { test_case_selector } ,status= " skipped " }} [$__interval])) or on() vector(0) ' ,
" legendFormat " : " skipped · {{ suite}} " ,
} ,
]
2026-04-21 09:35:43 -03:00
recent_branch_evidence = (
f ' sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info {{ { build_info_selector } }} [30d]))) '
)
non_primary_branch_evidence = (
f ' sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info {{ { build_info_selector } ,branch!~ " main|master|origin/main|origin/master|unknown " }} [30d]))) '
)
2026-04-20 08:35:05 -03:00
2026-04-19 14:18:41 -03:00
missing_tests_by_suite = (
2026-04-20 08:07:30 -03:00
f ' (( { suite_universe } ) unless on(suite) count by (suite) ( {{ __name__=~ " .*_quality_gate_tests_total " , { exported } }} )) '
2026-04-19 14:18:41 -03:00
)
missing_checks_by_suite = (
2026-04-20 08:07:30 -03:00
f ' (( { suite_universe } ) unless on(suite) count by (suite) ( {{ __name__=~ " .*_quality_gate_checks_total " , { exported } }} )) '
2026-04-19 14:18:41 -03:00
)
missing_coverage_by_suite = (
2026-04-20 08:07:30 -03:00
f ' (( { suite_universe } ) unless on(suite) count by (suite) (platform_quality_gate_workspace_line_coverage_percent {{ { exported } }} )) '
2026-04-19 14:18:41 -03:00
)
missing_loc_by_suite = (
2026-04-20 08:07:30 -03:00
f ' (( { suite_universe } ) unless on(suite) count by (suite) (platform_quality_gate_source_lines_over_500_total {{ { exported } }} )) '
2026-04-18 17:47:06 -03:00
)
2026-04-20 13:45:01 -03:00
missing_test_case_by_suite = (
f ' (( { suite_universe } ) unless on(suite) count by (suite) (platform_quality_gate_test_case_result {{ { exported } }} )) '
)
2026-04-18 17:47:06 -03:00
2026-04-19 14:18:41 -03:00
success_thresholds = {
2026-04-12 20:05:39 -03:00
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
{ " color " : " orange " , " value " : 80 } ,
{ " color " : " yellow " , " value " : 95 } ,
{ " color " : " green " , " value " : 99 } ,
] ,
}
failures_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 3 } ,
{ " color " : " red " , " value " : 5 } ,
] ,
}
2026-04-18 17:47:06 -03:00
coverage_gap_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 5 } ,
{ " color " : " red " , " value " : 10 } ,
] ,
}
2026-04-12 22:58:21 -03:00
smell_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
{ " color " : " green " , " value " : 0 } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 3 } ,
{ " color " : " red " , " value " : 5 } ,
] ,
}
2026-04-19 14:18:41 -03:00
missing_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " red " , " value " : 1 } ,
] ,
}
2026-04-12 20:05:39 -03:00
2026-04-18 17:47:06 -03:00
panels . append (
stat_panel (
2 ,
" Success Rate (24h) " ,
success_rate_24h ,
2026-04-19 14:18:41 -03:00
{ " h " : 5 , " w " : 4 , " x " : 0 , " y " : 0 } ,
2026-04-18 17:47:06 -03:00
unit = " percent " ,
decimals = 2 ,
instant = True ,
2026-04-19 14:18:41 -03:00
thresholds = success_thresholds ,
2026-04-18 17:47:06 -03:00
)
2026-03-31 14:51:49 -03:00
)
2026-04-18 17:47:06 -03:00
panels . append (
stat_panel (
3 ,
" Success Rate (30d) " ,
success_rate_30d ,
2026-04-19 14:18:41 -03:00
{ " h " : 5 , " w " : 4 , " x " : 4 , " y " : 0 } ,
2026-04-18 17:47:06 -03:00
unit = " percent " ,
decimals = 2 ,
instant = True ,
2026-04-19 14:18:41 -03:00
thresholds = success_thresholds ,
2026-04-18 17:47:06 -03:00
)
2026-03-31 14:51:49 -03:00
)
2026-04-12 20:05:39 -03:00
panels . append (
2026-04-18 17:47:06 -03:00
stat_panel (
2026-04-12 20:05:39 -03:00
4 ,
2026-04-18 17:47:06 -03:00
" Failures (24h) " ,
failures_24h ,
2026-04-19 14:18:41 -03:00
{ " h " : 5 , " w " : 4 , " x " : 8 , " y " : 0 } ,
2026-04-18 17:47:06 -03:00
unit = " none " ,
instant = True ,
thresholds = failures_thresholds ,
)
)
panels . append (
stat_panel (
5 ,
" Runs (24h) " ,
runs_24h ,
2026-04-19 14:18:41 -03:00
{ " h " : 5 , " w " : 4 , " x " : 12 , " y " : 0 } ,
2026-04-18 17:47:06 -03:00
unit = " none " ,
instant = True ,
thresholds = {
" mode " : " absolute " ,
" steps " : [ { " color " : " red " , " value " : None } , { " color " : " green " , " value " : 1 } ] ,
} ,
)
)
panels . append (
stat_panel (
6 ,
" Avg Coverage ( % ) " ,
average_coverage ,
2026-04-19 14:18:41 -03:00
{ " h " : 5 , " w " : 4 , " x " : 16 , " y " : 0 } ,
2026-04-18 17:47:06 -03:00
unit = " percent " ,
decimals = 2 ,
instant = True ,
2026-04-19 14:18:41 -03:00
thresholds = success_thresholds ,
2026-04-18 17:47:06 -03:00
)
)
panels . append (
stat_panel (
7 ,
" Suites with LOC >500 " ,
suites_loc_violating ,
2026-04-19 14:18:41 -03:00
{ " h " : 5 , " w " : 4 , " x " : 20 , " y " : 0 } ,
2026-04-18 17:47:06 -03:00
unit = " none " ,
instant = True ,
thresholds = smell_thresholds ,
)
)
2026-04-19 14:18:41 -03:00
panels . append (
stat_panel (
19 ,
" Failing Tests " ,
checks_failed_tests ,
{ " h " : 4 , " w " : 3 , " x " : 0 , " y " : 5 } ,
unit = " none " ,
instant = True ,
thresholds = failures_thresholds ,
)
)
panels . append (
stat_panel (
20 ,
" Failing Coverage " ,
checks_failed_coverage ,
{ " h " : 4 , " w " : 3 , " x " : 3 , " y " : 5 } ,
unit = " none " ,
instant = True ,
thresholds = failures_thresholds ,
)
)
panels . append (
stat_panel (
21 ,
" Failing LOC " ,
checks_failed_loc ,
{ " h " : 4 , " w " : 3 , " x " : 6 , " y " : 5 } ,
unit = " none " ,
instant = True ,
thresholds = failures_thresholds ,
)
)
panels . append (
stat_panel (
22 ,
" Failing Docs/Naming " ,
checks_failed_docs ,
{ " h " : 4 , " w " : 3 , " x " : 9 , " y " : 5 } ,
unit = " none " ,
instant = True ,
thresholds = failures_thresholds ,
)
)
panels . append (
stat_panel (
23 ,
" Failing Gate/Glue " ,
checks_failed_gate ,
{ " h " : 4 , " w " : 3 , " x " : 12 , " y " : 5 } ,
unit = " none " ,
instant = True ,
thresholds = failures_thresholds ,
)
)
panels . append (
stat_panel (
24 ,
" Failing SonarQube " ,
checks_failed_sonarqube ,
{ " h " : 4 , " w " : 3 , " x " : 15 , " y " : 5 } ,
unit = " none " ,
instant = True ,
thresholds = failures_thresholds ,
)
)
panels . append (
stat_panel (
25 ,
" Failing Supply Chain " ,
checks_failed_supply_chain ,
{ " h " : 4 , " w " : 3 , " x " : 18 , " y " : 5 } ,
unit = " none " ,
instant = True ,
thresholds = failures_thresholds ,
)
)
panels . append (
stat_panel (
26 ,
" Total Failing Checks " ,
checks_failed_total ,
{ " h " : 4 , " w " : 3 , " x " : 21 , " y " : 5 } ,
unit = " none " ,
instant = True ,
thresholds = failures_thresholds ,
)
)
2026-04-18 17:47:06 -03:00
panels . append (
bargauge_panel (
8 ,
" Failures by Suite (24h) " ,
2026-04-19 14:18:41 -03:00
failures_by_suite_24h ,
{ " h " : 8 , " w " : 8 , " x " : 0 , " y " : 9 } ,
2026-04-12 20:05:39 -03:00
unit = " none " ,
instant = True ,
legend = " {{ suite}} " ,
thresholds = failures_thresholds ,
)
)
panels . append (
bargauge_panel (
2026-04-18 17:47:06 -03:00
9 ,
" Success Rate by Suite (24h) " ,
success_rate_by_suite_24h ,
2026-04-19 14:18:41 -03:00
{ " h " : 8 , " w " : 8 , " x " : 8 , " y " : 9 } ,
2026-04-12 20:05:39 -03:00
unit = " percent " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " asc " ,
2026-04-19 14:18:41 -03:00
thresholds = success_thresholds ,
2026-04-12 20:05:39 -03:00
decimals = 2 ,
)
2026-03-31 14:51:49 -03:00
)
2026-04-18 17:47:06 -03:00
coverage_gap_panel = bargauge_panel (
10 ,
" Coverage Gap to 95 % by Suite " ,
coverage_gap ,
2026-04-19 14:18:41 -03:00
{ " h " : 8 , " w " : 8 , " x " : 16 , " y " : 9 } ,
2026-04-18 17:47:06 -03:00
unit = " percent " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " desc " ,
thresholds = coverage_gap_thresholds ,
decimals = 2 ,
)
coverage_gap_panel [ " description " ] = " Gap from the 95 % target. 0 means the suite is at or above target. "
panels . append ( coverage_gap_panel )
2026-04-19 14:18:41 -03:00
2026-04-18 17:47:06 -03:00
history_panel = timeseries_panel (
11 ,
" Success History by Suite " ,
success_history_by_suite ,
2026-04-19 14:18:41 -03:00
{ " h " : 8 , " w " : 24 , " x " : 0 , " y " : 17 } ,
2026-04-08 23:33:17 -03:00
unit = " percent " ,
2026-04-18 17:47:06 -03:00
legend = " {{ suite}} " ,
2026-04-08 23:33:17 -03:00
legend_display = " list " ,
legend_placement = " bottom " ,
)
2026-04-18 17:47:06 -03:00
history_panel [ " fieldConfig " ] [ " defaults " ] [ " min " ] = 0
history_panel [ " fieldConfig " ] [ " defaults " ] [ " max " ] = 100
history_panel [ " fieldConfig " ] [ " defaults " ] [ " custom " ] = {
2026-04-09 16:35:14 -03:00
" drawStyle " : " line " ,
" lineInterpolation " : " linear " ,
" lineWidth " : 2 ,
2026-04-18 17:47:06 -03:00
" fillOpacity " : 8 ,
2026-04-09 16:35:14 -03:00
" showPoints " : " always " ,
2026-04-18 17:47:06 -03:00
" pointSize " : 3 ,
2026-04-09 16:35:14 -03:00
" spanNulls " : True ,
}
2026-04-18 17:47:06 -03:00
panels . append ( history_panel )
2026-04-19 14:18:41 -03:00
2026-04-18 17:47:06 -03:00
panels . append (
timeseries_panel (
12 ,
" Run Outcomes (Selected Scope) " ,
None ,
2026-04-19 14:18:41 -03:00
{ " h " : 8 , " w " : 8 , " x " : 0 , " y " : 25 } ,
2026-04-18 17:47:06 -03:00
unit = " none " ,
targets = [
{
" refId " : " A " ,
2026-04-19 14:18:41 -03:00
" expr " : f ' sum(increase(platform_quality_gate_runs_total {{ { runs_success_selector } }} [$__interval])) or on() vector(0) ' ,
2026-04-18 17:47:06 -03:00
" legendFormat " : " Success " ,
} ,
{
" refId " : " B " ,
2026-04-19 14:18:41 -03:00
" expr " : f ' sum(increase(platform_quality_gate_runs_total {{ { runs_failure_selector } }} [$__interval])) or on() vector(0) ' ,
2026-04-18 17:47:06 -03:00
" legendFormat " : " Failure " ,
} ,
{
" refId " : " C " ,
2026-04-19 14:18:41 -03:00
" expr " : f ' sum(increase(platform_quality_gate_runs_total {{ { runs_selector } }} [$__interval])) or on() vector(0) ' ,
2026-04-18 17:47:06 -03:00
" legendFormat " : " Total " ,
} ,
] ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " sum " ] ,
)
)
panels . append (
timeseries_panel (
13 ,
" Coverage & LOC History (Selected Scope) " ,
None ,
2026-04-19 14:18:41 -03:00
{ " h " : 8 , " w " : 8 , " x " : 8 , " y " : 25 } ,
2026-04-18 17:47:06 -03:00
unit = " none " ,
targets = [
{
" refId " : " A " ,
2026-04-19 14:18:41 -03:00
" expr " : f ' max_over_time(platform_quality_gate_workspace_line_coverage_percent {{ { workspace_coverage_selector } }} [$__interval]) ' ,
2026-04-18 17:47:06 -03:00
" legendFormat " : " {{ suite}} coverage % " ,
} ,
{
" refId " : " B " ,
2026-04-19 14:18:41 -03:00
" expr " : f ' max_over_time(platform_quality_gate_source_lines_over_500_total {{ { smell_selector } }} [$__interval]) ' ,
2026-04-18 17:47:06 -03:00
" legendFormat " : " {{ suite}} files >500 LOC " ,
} ,
] ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " ] ,
)
)
2026-04-19 14:18:41 -03:00
run_mix_panel = pie_panel (
14 ,
" Run Status Mix (30d) " ,
f ' sum by (status) (increase(platform_quality_gate_runs_total {{ { runs_selector } }} [30d])) ' ,
{ " h " : 8 , " w " : 8 , " x " : 16 , " y " : 25 } ,
)
run_mix_panel [ " targets " ] [ 0 ] [ " legendFormat " ] = " {{ status}} "
run_mix_panel [ " fieldConfig " ] [ " defaults " ] [ " unit " ] = " none "
panels . append ( run_mix_panel )
2026-04-20 08:07:30 -03:00
panels . append (
timeseries_panel (
130 ,
" Fail Trend: Tests " ,
_check_state_series ( check_regex_tests , True ) ,
{ " h " : 6 , " w " : 3 , " x " : 0 , " y " : 33 } ,
unit = " none " ,
legend = " {{ suite}} " ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " ] ,
)
)
panels . append (
timeseries_panel (
131 ,
" Fail Trend: Coverage " ,
_check_state_series ( check_regex_coverage , True ) ,
{ " h " : 6 , " w " : 3 , " x " : 3 , " y " : 33 } ,
unit = " none " ,
legend = " {{ suite}} " ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " ] ,
)
)
panels . append (
timeseries_panel (
132 ,
" Fail Trend: LOC " ,
_check_state_series ( check_regex_loc , True ) ,
{ " h " : 6 , " w " : 3 , " x " : 6 , " y " : 33 } ,
unit = " none " ,
legend = " {{ suite}} " ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " ] ,
)
)
panels . append (
timeseries_panel (
133 ,
" Fail Trend: Style " ,
_check_state_series ( check_regex_style , True ) ,
{ " h " : 6 , " w " : 3 , " x " : 9 , " y " : 33 } ,
unit = " none " ,
legend = " {{ suite}} " ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " ] ,
)
)
panels . append (
timeseries_panel (
134 ,
" Fail Trend: Gate Glue " ,
_check_state_series ( check_regex_gate_glue , True ) ,
{ " h " : 6 , " w " : 3 , " x " : 12 , " y " : 33 } ,
unit = " none " ,
legend = " {{ suite}} " ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " ] ,
)
)
panels . append (
timeseries_panel (
135 ,
" Fail Trend: SonarQube " ,
_check_state_series ( check_regex_sonarqube , True ) ,
{ " h " : 6 , " w " : 3 , " x " : 15 , " y " : 33 } ,
unit = " none " ,
legend = " {{ suite}} " ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " ] ,
)
)
panels . append (
timeseries_panel (
136 ,
" Fail Trend: Supply Chain " ,
_check_state_series ( check_regex_supply_chain , True ) ,
{ " h " : 6 , " w " : 3 , " x " : 18 , " y " : 33 } ,
unit = " none " ,
legend = " {{ suite}} " ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " ] ,
)
)
panels . append (
timeseries_panel (
138 ,
" Pass Trend: Tests " ,
_check_state_series ( check_regex_tests , False ) ,
{ " h " : 6 , " w " : 3 , " x " : 0 , " y " : 39 } ,
unit = " none " ,
legend = " {{ suite}} " ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " ] ,
)
)
panels . append (
timeseries_panel (
139 ,
" Pass Trend: Coverage " ,
_check_state_series ( check_regex_coverage , False ) ,
{ " h " : 6 , " w " : 3 , " x " : 3 , " y " : 39 } ,
unit = " none " ,
legend = " {{ suite}} " ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " ] ,
)
)
panels . append (
timeseries_panel (
140 ,
" Pass Trend: LOC " ,
_check_state_series ( check_regex_loc , False ) ,
{ " h " : 6 , " w " : 3 , " x " : 6 , " y " : 39 } ,
unit = " none " ,
legend = " {{ suite}} " ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " ] ,
)
)
panels . append (
timeseries_panel (
141 ,
" Pass Trend: Style " ,
_check_state_series ( check_regex_style , False ) ,
{ " h " : 6 , " w " : 3 , " x " : 9 , " y " : 39 } ,
unit = " none " ,
legend = " {{ suite}} " ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " ] ,
)
)
panels . append (
timeseries_panel (
142 ,
" Pass Trend: Gate Glue " ,
_check_state_series ( check_regex_gate_glue , False ) ,
{ " h " : 6 , " w " : 3 , " x " : 12 , " y " : 39 } ,
unit = " none " ,
legend = " {{ suite}} " ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " ] ,
)
)
panels . append (
timeseries_panel (
143 ,
" Pass Trend: SonarQube " ,
_check_state_series ( check_regex_sonarqube , False ) ,
{ " h " : 6 , " w " : 3 , " x " : 15 , " y " : 39 } ,
unit = " none " ,
legend = " {{ suite}} " ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " ] ,
)
)
panels . append (
timeseries_panel (
144 ,
" Pass Trend: Supply Chain " ,
_check_state_series ( check_regex_supply_chain , False ) ,
{ " h " : 6 , " w " : 3 , " x " : 18 , " y " : 39 } ,
unit = " none " ,
legend = " {{ suite}} " ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " ] ,
)
)
2026-04-18 17:47:06 -03:00
panels . append (
2026-04-19 14:18:41 -03:00
bargauge_panel (
15 ,
" Latest Test Counters (Suite + Result) " ,
f ' sum by (suite, result) ( {{ { tests_selector } }} ) ' ,
2026-04-20 08:07:30 -03:00
{ " h " : 6 , " w " : 3 , " x " : 21 , " y " : 39 } ,
2026-04-18 17:47:06 -03:00
unit = " none " ,
instant = True ,
2026-04-19 14:18:41 -03:00
legend = " {{ suite}} · {{ result}} " ,
sort_order = " desc " ,
limit = 24 ,
2026-04-18 17:47:06 -03:00
)
)
2026-04-20 08:35:05 -03:00
panels . append (
timeseries_panel (
145 ,
" Problematic Tests Over Time (Top failures) " ,
problematic_tests_history ,
{ " h " : 8 , " w " : 12 , " x " : 0 , " y " : 45 } ,
unit = " none " ,
legend = " {{ suite}} · {{ test}} " ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " max " , " sum " ] ,
2026-04-20 13:45:01 -03:00
links = jenkins_suite_links ( ) ,
2026-04-20 08:35:05 -03:00
)
)
panels . append (
timeseries_panel (
146 ,
" Selected Test Pass/Fail History " ,
None ,
{ " h " : 8 , " w " : 8 , " x " : 12 , " y " : 45 } ,
unit = " none " ,
targets = selected_test_pass_fail ,
legend_display = " list " ,
legend_placement = " bottom " ,
legend_calcs = [ " lastNotNull " , " sum " ] ,
2026-04-20 13:45:01 -03:00
links = jenkins_suite_links ( ) ,
2026-04-20 08:35:05 -03:00
)
)
panels . append (
bargauge_panel (
147 ,
" Most Problematic Test by Suite (30d) " ,
worst_test_per_suite ,
{ " h " : 8 , " w " : 4 , " x " : 20 , " y " : 45 } ,
unit = " none " ,
instant = True ,
legend = " {{ suite}} · {{ test}} " ,
sort_order = " desc " ,
thresholds = failures_thresholds ,
limit = 9 ,
2026-04-20 13:45:01 -03:00
links = jenkins_suite_links ( ) ,
2026-04-20 08:35:05 -03:00
)
)
2026-04-18 17:47:06 -03:00
2026-04-12 22:58:21 -03:00
coverage_panel = bargauge_panel (
2026-04-18 17:47:06 -03:00
17 ,
" Coverage by Suite (Latest, gate 95) " ,
coverage_with_missing ,
2026-04-20 08:35:05 -03:00
{ " h " : 8 , " w " : 12 , " x " : 0 , " y " : 53 } ,
2026-04-12 22:58:21 -03:00
unit = " percent " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " asc " ,
2026-04-19 14:18:41 -03:00
thresholds = success_thresholds ,
2026-04-12 22:58:21 -03:00
decimals = 2 ,
)
coverage_panel [ " fieldConfig " ] [ " defaults " ] [ " mappings " ] = [
2026-04-18 17:47:06 -03:00
{ " type " : " value " , " options " : { " -1 " : { " text " : " missing " } } }
2026-04-12 22:58:21 -03:00
]
panels . append ( coverage_panel )
2026-04-19 14:18:41 -03:00
2026-04-12 22:58:21 -03:00
smell_panel = bargauge_panel (
2026-04-18 17:47:06 -03:00
18 ,
" Files >500 LOC by Suite (Latest) " ,
smell_with_missing ,
2026-04-20 08:35:05 -03:00
{ " h " : 8 , " w " : 12 , " x " : 12 , " y " : 53 } ,
2026-04-12 22:58:21 -03:00
unit = " none " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " desc " ,
thresholds = smell_thresholds ,
)
smell_panel [ " fieldConfig " ] [ " defaults " ] [ " mappings " ] = [
2026-04-18 17:47:06 -03:00
{ " type " : " value " , " options " : { " -1 " : { " text " : " missing " } } }
2026-04-12 22:58:21 -03:00
]
panels . append ( smell_panel )
2026-01-18 02:50:07 -03:00
2026-04-19 14:18:41 -03:00
panels . append (
bargauge_panel (
27 ,
" Missing Tests Metrics by Suite " ,
missing_tests_by_suite ,
2026-04-20 08:35:05 -03:00
{ " h " : 7 , " w " : 6 , " x " : 0 , " y " : 61 } ,
2026-04-19 14:18:41 -03:00
unit = " none " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " desc " ,
thresholds = missing_thresholds ,
decimals = 0 ,
)
)
panels . append (
bargauge_panel (
28 ,
" Missing Checks Metrics by Suite " ,
missing_checks_by_suite ,
2026-04-20 08:35:05 -03:00
{ " h " : 7 , " w " : 6 , " x " : 6 , " y " : 61 } ,
2026-04-19 14:18:41 -03:00
unit = " none " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " desc " ,
thresholds = missing_thresholds ,
decimals = 0 ,
)
)
panels . append (
bargauge_panel (
29 ,
" Missing Coverage Metrics by Suite " ,
missing_coverage_by_suite ,
2026-04-20 08:35:05 -03:00
{ " h " : 7 , " w " : 6 , " x " : 12 , " y " : 61 } ,
2026-04-19 14:18:41 -03:00
unit = " none " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " desc " ,
thresholds = missing_thresholds ,
decimals = 0 ,
)
)
panels . append (
bargauge_panel (
30 ,
" Missing LOC Metrics by Suite " ,
missing_loc_by_suite ,
2026-04-20 08:35:05 -03:00
{ " h " : 7 , " w " : 6 , " x " : 18 , " y " : 61 } ,
2026-04-19 14:18:41 -03:00
unit = " none " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " desc " ,
thresholds = missing_thresholds ,
decimals = 0 ,
)
)
panels . append (
stat_panel (
31 ,
" SonarQube API Up " ,
" (max(sonarqube_up) or on() vector(0)) " ,
2026-04-20 08:35:05 -03:00
{ " h " : 6 , " w " : 4 , " x " : 0 , " y " : 68 } ,
2026-04-19 14:18:41 -03:00
unit = " none " ,
instant = True ,
thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
{ " color " : " green " , " value " : 1 } ,
] ,
} ,
)
)
panels . append (
stat_panel (
32 ,
" Sonar Projects (Selected) " ,
f ' (count(sonarqube_project_quality_gate_pass {{ project_key=~ " { suite_var } " }} ) or on() vector(0)) ' ,
2026-04-20 08:35:05 -03:00
{ " h " : 6 , " w " : 4 , " x " : 4 , " y " : 68 } ,
2026-04-19 14:18:41 -03:00
unit = " none " ,
instant = True ,
thresholds = failures_thresholds ,
)
)
panels . append (
stat_panel (
33 ,
" Sonar Gate Fetch Errors " ,
" (max(sonarqube_quality_gate_fetch_errors_total) or on() vector(0)) " ,
2026-04-20 08:35:05 -03:00
{ " h " : 6 , " w " : 4 , " x " : 8 , " y " : 68 } ,
2026-04-19 14:18:41 -03:00
unit = " none " ,
instant = True ,
thresholds = failures_thresholds ,
)
)
sonar_status_mix_panel = pie_panel (
34 ,
" Sonar Gate Status Mix (Selected) " ,
2026-04-20 08:07:30 -03:00
f ' count by (status) (sonarqube_project_quality_gate_pass {{ project_key=~ " { suite_var } " }} ) ' ,
2026-04-20 08:35:05 -03:00
{ " h " : 6 , " w " : 6 , " x " : 12 , " y " : 68 } ,
2026-04-19 14:18:41 -03:00
)
sonar_status_mix_panel [ " targets " ] [ 0 ] [ " legendFormat " ] = " {{ status}} "
panels . append ( sonar_status_mix_panel )
panels . append (
bargauge_panel (
35 ,
" Projects Failing Sonar Gate " ,
2026-04-20 08:07:30 -03:00
f ' sort_desc(count by (project_key) (sonarqube_project_quality_gate_pass {{ project_key=~ " { suite_var } " ,status!~ " OK|ok " }} )) ' ,
2026-04-20 08:35:05 -03:00
{ " h " : 6 , " w " : 6 , " x " : 18 , " y " : 68 } ,
2026-04-19 14:18:41 -03:00
unit = " none " ,
instant = True ,
legend = " {{ project_key}} " ,
sort_order = " desc " ,
thresholds = failures_thresholds ,
)
)
2026-04-20 13:45:01 -03:00
panels . append (
bargauge_panel (
148 ,
" Missing Test-Case Metrics by Suite " ,
missing_test_case_by_suite ,
{ " h " : 6 , " w " : 24 , " x " : 0 , " y " : 74 } ,
unit = " none " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " desc " ,
thresholds = missing_thresholds ,
decimals = 0 ,
)
)
2026-04-21 09:35:43 -03:00
panels . append (
bargauge_panel (
149 ,
" Recent Branch Evidence by Suite (30d) " ,
recent_branch_evidence ,
{ " h " : 7 , " w " : 12 , " x " : 0 , " y " : 80 } ,
unit = " none " ,
instant = True ,
legend = " {{ suite}} · {{ branch}} " ,
sort_order = " desc " ,
thresholds = missing_thresholds ,
decimals = 0 ,
links = jenkins_suite_links ( ) ,
)
)
panels . append (
bargauge_panel (
150 ,
" Non-Primary Branch Evidence (30d) " ,
non_primary_branch_evidence ,
{ " h " : 7 , " w " : 12 , " x " : 12 , " y " : 80 } ,
unit = " none " ,
instant = True ,
legend = " {{ suite}} · {{ branch}} " ,
sort_order = " desc " ,
thresholds = failures_thresholds ,
decimals = 0 ,
links = jenkins_suite_links ( ) ,
)
)
2026-04-19 14:18:41 -03:00
2026-01-18 02:50:07 -03:00
return {
2026-04-19 14:18:41 -03:00
" uid " : " atlas-jobs " ,
2026-04-12 20:05:39 -03:00
" title " : " Atlas Testing " ,
2026-01-18 02:50:07 -03:00
" folderUid " : PRIVATE_FOLDER ,
" editable " : True ,
" panels " : panels ,
2026-04-12 20:05:39 -03:00
" time " : { " from " : " now-30d " , " to " : " now " } ,
2026-01-18 02:50:07 -03:00
" annotations " : { " list " : [ ] } ,
" schemaVersion " : 39 ,
" style " : " dark " ,
2026-04-19 14:18:41 -03:00
" tags " : [ " atlas " , " testing " , " quality-gate " , " ci " ] ,
" templating " : {
" list " : [
testing_suite_variable ( ) ,
2026-04-20 08:35:05 -03:00
testing_case_variable ( ) ,
2026-04-21 09:35:43 -03:00
testing_branch_variable ( ) ,
2026-04-20 13:45:01 -03:00
jenkins_base_variable ( ) ,
2026-04-19 14:18:41 -03:00
]
} ,
2026-01-18 02:50:07 -03:00
}
2026-04-03 14:55:16 -03:00
def build_power_dashboard():
    """Build the "Atlas Power" dashboard model (UPS, climate, and fan panels).

    The layout is three rows of two 12-column panels: a stat panel fed by
    instant queries on the left and the matching time-series history on the
    right.

    Returns:
        dict: Dashboard definition with uid ``atlas-power`` filed under
        ``PRIVATE_FOLDER`` and a default time range of the last 24 hours.
    """

    def cell(x, y):
        # Every panel on this dashboard uses the same 8-high, 12-wide footprint.
        return {"h": 8, "w": 12, "x": x, "y": y}

    def snapshot(ref_id, expr, legend):
        # Instant-query target for the stat panels.
        return {"refId": ref_id, "expr": expr, "legendFormat": legend, "instant": True}

    def history(ref_id, expr, legend):
        # Range-query target for the time-series panels.
        return {"refId": ref_id, "expr": expr, "legendFormat": legend}

    def prop(prop_id, value):
        return {"id": prop_id, "value": value}

    def attached(node):
        # Fresh dict per call so no override shares a description object.
        return prop("description", f"Attached node: {node}")

    def override(field_name, *properties):
        # Field override matched on the series' display name (its legendFormat).
        return {"matcher": {"id": "byName", "options": field_name}, "properties": list(properties)}

    status_mapping = [
        {
            "type": "value",
            "options": {
                "0": {"text": "⚡ Charging"},
                "1": {"text": "🔋 Discharging"},
            },
        }
    ]

    # Display names double as override matchers, so build each one exactly once.
    db_draw = f"{ANANKE_UPS_DB_NAME} Draw (W)"
    db_runtime = f"{ANANKE_UPS_DB_NAME} Discharge"
    db_status = f"{ANANKE_UPS_DB_NAME} Status"
    tethys_draw = f"{ANANKE_UPS_TETHYS_NAME} Draw (W)"
    tethys_runtime = f"{ANANKE_UPS_TETHYS_NAME} Discharge"
    tethys_status = f"{ANANKE_UPS_TETHYS_NAME} Status"

    # (refId, legend, instant expression, range expression) per fan group.
    fan_groups = (
        ("A", "Inside Outlet", CLIMATE_FAN_OUTLET_CURRENT, CLIMATE_FAN_OUTLET_SERIES),
        ("B", "Inside Inlet", CLIMATE_FAN_INSIDE_INLET_CURRENT, CLIMATE_FAN_INSIDE_INLET_SERIES),
        ("C", "Outside Inlet", CLIMATE_FAN_OUTSIDE_INLET_CURRENT, CLIMATE_FAN_OUTSIDE_INLET_SERIES),
        ("D", "Interior Fans", CLIMATE_FAN_INTERIOR_CURRENT, CLIMATE_FAN_INTERIOR_SERIES),
    )

    panels = [
        # Row 1: UPS snapshot + power-draw history.
        stat_panel(
            1,
            "UPS Current Load",
            None,
            cell(0, 0),
            unit="none",
            decimals=1,
            text_mode="name_and_value",
            targets=[
                snapshot("A", ANANKE_UPS_DRAW_WATTS_DB, db_draw),
                snapshot("B", ANANKE_UPS_RUNTIME_DB, db_runtime),
                snapshot("C", ANANKE_UPS_ON_BATTERY_DB, db_status),
                snapshot("D", ANANKE_UPS_DRAW_WATTS_TETHYS, tethys_draw),
                snapshot("E", ANANKE_UPS_RUNTIME_TETHYS, tethys_runtime),
                snapshot("F", ANANKE_UPS_ON_BATTERY_TETHYS, tethys_status),
            ],
            field_overrides=[
                override(db_draw, prop("unit", "watt"), attached(ANANKE_UPS_DB_NODE)),
                override(tethys_draw, prop("unit", "watt"), attached(ANANKE_UPS_TETHYS_NODE)),
                override(db_runtime, prop("unit", "s"), attached(ANANKE_UPS_DB_NODE)),
                override(tethys_runtime, prop("unit", "s"), attached(ANANKE_UPS_TETHYS_NODE)),
                override(db_status, prop("mappings", status_mapping), attached(ANANKE_UPS_DB_NODE)),
                override(tethys_status, prop("mappings", status_mapping), attached(ANANKE_UPS_TETHYS_NODE)),
            ],
            orientation="horizontal",
            wide_layout=True,
            description=(
                "Per-UPS live snapshot: current draw in watts, estimated battery runtime if discharge started now, and charging/discharging status."
            ),
        ),
        timeseries_panel(
            2,
            "UPS History (Power Draw)",
            None,
            cell(12, 0),
            unit="watt",
            targets=[
                history("A", ANANKE_UPS_DRAW_WATTS_DB_SERIES, ANANKE_UPS_DB_NAME),
                history("B", ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES, ANANKE_UPS_TETHYS_NAME),
            ],
            legend_display="table",
            legend_placement="right",
            description="Historical UPS power consumption in watts for titan-db and tethys.",
        ),
        # Row 2: climate snapshot + climate history.
        stat_panel(
            3,
            "Current Climate",
            None,
            cell(0, 8),
            unit="none",
            decimals=2,
            text_mode="name_and_value",
            targets=[
                snapshot("A", CLIMATE_TEMP_MAX, "Tent Temp (°C)"),
                snapshot("B", CLIMATE_PRESSURE_CURRENT, "Tent VPD (kPa)"),
                snapshot("C", CLIMATE_HUMIDITY_MAX, "Tent RH (%)"),
                snapshot("D", CLIMATE_DEWPOINT_CURRENT, "Dew Point (°C)"),
            ],
            field_overrides=[
                override("Tent Temp (°C)", prop("unit", "celsius")),
                override("Tent VPD (kPa)", prop("unit", "suffix:kPa")),
                override("Tent RH (%)", prop("unit", "percent")),
                override("Dew Point (°C)", prop("unit", "celsius")),
            ],
            orientation="horizontal",
            wide_layout=True,
            description="Current tent temperature, humidity, VPD, and dew point. These render once Typhon climate telemetry is online.",
        ),
        timeseries_panel(
            4,
            "Climate History",
            None,
            cell(12, 8),
            unit="celsius",
            targets=[
                history("A", CLIMATE_TEMP_SERIES, "Temperature (°C)"),
                history("B", CLIMATE_HUMIDITY_SERIES, "Humidity (%)"),
                history("C", CLIMATE_PRESSURE_SERIES, "VPD (kPa)"),
                history("D", CLIMATE_DEWPOINT_SERIES, "Dew Point (°C)"),
            ],
            field_overrides=[
                override("Humidity (%)", prop("unit", "percent")),
                # VPD gets its own right-hand axis labelled in kPa.
                override(
                    "VPD (kPa)",
                    prop("unit", "none"),
                    prop("custom.axisPlacement", "right"),
                    prop("custom.axisLabel", "kPa"),
                    prop("decimals", 2),
                ),
            ],
            legend_display="table",
            legend_placement="right",
            description="Two-axis chart: tent temperature/humidity/dew point (left axis) and tent VPD in kPa (right axis).",
        ),
        # Row 3: fan snapshot + fan history.
        stat_panel(
            5,
            "Fan Activity",
            None,
            cell(0, 16),
            unit="none",
            decimals=0,
            text_mode="name_and_value",
            targets=[
                snapshot(ref_id, f"round({current})", label)
                for ref_id, label, current, _series in fan_groups
            ],
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 7},
                    {"color": "red", "value": 9},
                ],
            },
            orientation="horizontal",
            wide_layout=True,
            description="Current fan activity levels (0-10): inside outlet, inside inlet, outside inlet, and interior fans.",
        ),
        timeseries_panel(
            6,
            "Fan History (0-10)",
            None,
            cell(12, 16),
            unit="none",
            max_value=10,
            targets=[
                history(ref_id, series, label)
                for ref_id, label, _current, series in fan_groups
            ],
            legend_display="table",
            legend_placement="right",
            description="Historical fan activity for all four fan groups (0-10 scale).",
        ),
    ]

    return {
        "uid": "atlas-power",
        "title": "Atlas Power",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-24h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "power", "climate"],
    }
2025-12-02 13:16:00 -03:00
def build_gpu_dashboard():
    """Build the "Atlas GPU" dashboard model.

    Four 12-column panels in a 2x2 grid: namespace GPU share (pie), GPU
    utilisation by namespace and by node (time series), and the top ten pods
    by GPU utilisation (table).

    Returns:
        dict: Dashboard definition with uid ``atlas-gpu`` filed under
        ``PRIVATE_FOLDER``, a default time range of the last 12 hours, and
        the CPU/GPU/RAM namespace-scope template variables.
    """
    gpu_scope = "$namespace_scope_gpu"
    # Both time-series panels share the same legend presentation.
    table_legend = {"legend_display": "table", "legend_placement": "right"}

    panels = [
        pie_panel(
            1,
            "Namespace GPU Share",
            namespace_gpu_share_expr(gpu_scope),
            {"h": 8, "w": 12, "x": 0, "y": 0},
            links=namespace_scope_links("namespace_scope_gpu"),
            description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
        ),
        timeseries_panel(
            2,
            "GPU Util by Namespace",
            namespace_gpu_usage_instant(gpu_scope),
            {"h": 8, "w": 12, "x": 12, "y": 0},
            unit="percent",
            legend="{{namespace}}",
            **table_legend,
        ),
        timeseries_panel(
            3,
            "GPU Util by Node",
            gpu_util_by_hostname(),
            {"h": 8, "w": 12, "x": 0, "y": 8},
            unit="percent",
            legend="{{Hostname}}",
            **table_legend,
        ),
        table_panel(
            4,
            "Top Pods by GPU Util",
            'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
            {"h": 8, "w": 12, "x": 12, "y": 8},
            unit="percent",
            transformations=[{"id": "labelsToFields", "options": {}}],
        ),
    ]

    # All three scope variables are declared here, in CPU, GPU, RAM order.
    scope_variables = [
        namespace_scope_variable(f"namespace_scope_{resource}", f"{resource.upper()} namespace filter")
        for resource in ("cpu", "gpu", "ram")
    ]

    return {
        "uid": "atlas-gpu",
        "title": "Atlas GPU",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "gpu"],
        "templating": {"list": scope_variables},
    }
2025-11-17 14:22:46 -03:00
# Registry of every generated dashboard, keyed by dashboard uid.
# Each entry names the builder that produces the dashboard model and the
# ConfigMap manifest it is rendered into. Every uid is "atlas-<slug>" and every
# manifest is "services/monitoring/grafana-dashboard-<slug>.yaml", so the table
# below only lists the slug and its builder. Insertion order is the order in
# which main() builds and renders the dashboards.
DASHBOARDS = {
    f"atlas-{slug}": {
        "builder": builder,
        "configmap": ROOT / "services" / "monitoring" / f"grafana-dashboard-{slug}.yaml",
    }
    for slug, builder in (
        ("overview", build_overview),
        ("pods", build_pods_dashboard),
        ("nodes", build_nodes_dashboard),
        ("storage", build_storage_dashboard),
        ("network", build_network_dashboard),
        ("mail", build_mail_dashboard),
        ("jobs", build_jobs_dashboard),
        ("power", build_power_dashboard),
        ("gpu", build_gpu_dashboard),
    )
}
2025-11-17 16:27:38 -03:00
def write_json(uid, data):
    """Write a dashboard model to ``DASHBOARD_DIR/<uid>.json``.

    Args:
        uid: Dashboard uid; used as the file stem.
        data: JSON-serialisable dashboard definition.

    The directory is created if it does not exist. The file is written with
    two-space indentation and a trailing newline.
    """
    DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(data, indent=2)
    (DASHBOARD_DIR / f"{uid}.json").write_text(serialized + "\n")
2025-11-17 16:27:38 -03:00
def render_configmap(uid, info):
    """Render ``DASHBOARD_DIR/<uid>.json`` into its ConfigMap manifest.

    Args:
        uid: Dashboard uid; selects the JSON file to read.
        info: Registry entry; ``info["configmap"]`` is the output path.

    The JSON is parsed and re-serialised with two-space indentation, each line
    is indented, and the result is substituted into ``CONFIG_TEMPLATE`` along
    with the manifest's path relative to ``ROOT``, the ConfigMap name (the
    output file's stem), and the data key (the JSON file name). A one-line
    summary is printed after the manifest is written.
    """
    source = DASHBOARD_DIR / f"{uid}.json"
    # Round-trip through the parser so the embedded payload is always
    # formatted the same way regardless of how the file on disk was written.
    document = json.loads(source.read_text())
    pretty = json.dumps(document, indent=2)
    # NOTE(review): indent width assumed to be four spaces (the extracted source
    # had collapsed whitespace) - confirm against CONFIG_TEMPLATE's `data:` block.
    body_lines = ["    " + line for line in pretty.splitlines()]

    target = info["configmap"]
    manifest = CONFIG_TEMPLATE.format(
        relative_path=target.relative_to(ROOT),
        name=target.stem,
        key=source.name,
        payload="\n".join(body_lines),
    )
    target.write_text(manifest)
    print(f"Rendered {source.name} -> {target.relative_to(ROOT)}")
def main():
    """Command-line entry point.

    With ``--build`` every dashboard's JSON file is regenerated from its
    builder first. The ConfigMaps are then re-rendered from the JSON files on
    disk whether or not ``--build`` was given.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--build", action="store_true", help="Regenerate dashboard JSON files from builders")
    rebuild_json = cli.parse_args().build

    if rebuild_json:
        for uid, entry in DASHBOARDS.items():
            dashboard = entry["builder"]()
            write_json(uid, dashboard)

    # Rendering always runs so manifests stay in sync with the JSON on disk.
    for uid, entry in DASHBOARDS.items():
        render_configmap(uid, entry)


if __name__ == "__main__":
    main()