2025-11-17 14:22:46 -03:00
#!/usr/bin/env python3
2025-11-17 16:27:38 -03:00
""" Generate Atlas Grafana dashboards and render them into ConfigMaps.
2025-11-17 14:22:46 -03:00
Usage:
    scripts/dashboards_render_atlas.py --build  # rebuild JSON + ConfigMaps
    scripts/dashboards_render_atlas.py          # re-render ConfigMaps from JSON
2025-11-17 14:22:46 -03:00
"""
2025-11-17 16:27:38 -03:00
2025-11-17 14:22:46 -03:00
import argparse
import json
import textwrap
2026-01-01 14:44:33 -03:00
import urllib . parse
2025-11-17 14:22:46 -03:00
from pathlib import Path
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# Paths, folders, and shared metadata
# ---------------------------------------------------------------------------
2025-11-17 14:22:46 -03:00
# Repository root: two levels up from this file (parents[1] of the resolved path).
ROOT = Path(__file__).resolve().parents[1]
DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards"

# ConfigMap wrapper filled in with str.format(relative_path=, name=, key=, payload=).
# NOTE(review): the nested-key indentation was restored from the ConfigMap
# schema (name/labels under metadata, key under data). {payload} sits at
# column 0, so the caller presumably pre-indents the JSON for the block
# scalar -- confirm against the render code.
CONFIG_TEMPLATE = textwrap.dedent(
    """# {relative_path}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {name}
  labels:
    grafana_dashboard: "1"
data:
  {key}: |
{payload}
"""
)

# Prometheus-type datasource reference (uid "atlas-vm").
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}

# Folder identifiers and the Astraios mount path used by the helpers below.
PUBLIC_FOLDER = "overview"
PUBLIC_DASHBOARD_FOLDER = "atlas-public"
PRIVATE_FOLDER = "atlas-internal"
ASTRAIOS_MOUNTPOINT = "/mnt/astraios"
2026-05-15 19:52:46 -03:00
# Bare color name -> shared Atlas tone.
GLOBAL_STATUS_COLOR_TONES = {
    "blue": "dark-blue",
    "green": "dark-green",
    "yellow": "dark-yellow",
    "orange": "dark-orange",
    "red": "dark-red",
}
# Dict keys whose string values are treated as color names.
COLOR_VALUE_KEYS = {"color", "fixedColor"}


def apply_global_status_palette(value, parent_key=None):
    """Normalize generated Grafana status colors to the shared Atlas tones.

    Walks dicts and lists recursively and returns a new structure; the input
    is not mutated. A string is remapped only when the nearest enclosing dict
    key is in COLOR_VALUE_KEYS (list items inherit the key of the list), and
    strings with no entry in GLOBAL_STATUS_COLOR_TONES pass through unchanged.
    """
    if isinstance(value, dict):
        return {key: apply_global_status_palette(item, key) for key, item in value.items()}
    if isinstance(value, list):
        return [apply_global_status_palette(item, parent_key) for item in value]
    if parent_key in COLOR_VALUE_KEYS and isinstance(value, str):
        return GLOBAL_STATUS_COLOR_TONES.get(value, value)
    return value
2025-11-17 16:27:38 -03:00
# Absolute-mode threshold steps for percentage values; the first step has no
# lower bound (value None) and later steps start at 50, 75 and 91.5.
PERCENT_THRESHOLDS = {
    "mode": "absolute",
    "steps": [
        {"color": "green", "value": None},
        {"color": "yellow", "value": 50},
        {"color": "orange", "value": 75},
        {"color": "red", "value": 91.5},
    ],
}
# Range window used by namespace_cpu_raw() for the CPU rate.
NAMESPACE_CPU_WINDOW = "1m"
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# Cluster metadata
# ---------------------------------------------------------------------------
CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
CONTROL_DEPENDENCIES = ["titan-db", "titan-jh"]
CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES
WORKER_NODES = [
    "titan-04",
    "titan-05",
    "titan-06",
    "titan-07",
    "titan-08",
    "titan-09",
    "titan-10",
    "titan-11",
    "titan-20",
    "titan-21",
    "titan-12",
    "titan-13",
    "titan-14",
    "titan-15",
    "titan-16",
    "titan-17",
    "titan-18",
    "titan-19",
    "titan-22",
    "titan-24",
]
# Regex alternations of the node lists, for use inside =~ / !~ label matchers.
CONTROL_REGEX = "|".join(CONTROL_PLANE_NODES)
CONTROL_ALL_REGEX = "|".join(CONTROL_ALL)
WORKER_REGEX = "|".join(WORKER_NODES)
CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
WORKER_TOTAL = len(WORKER_NODES)
# "/<total>" suffix strings derived from the node counts.
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
# Namespaces considered infrastructure (excluded from workload counts)
INFRA_PATTERNS = [
    "kube-.*",
    ".*-system",
    "traefik",
    "monitoring",
    "logging",
    "cert-manager",
    "maintenance",
    "postgres",
]
INFRA_REGEX = f"^({'|'.join(INFRA_PATTERNS)})$"
# Namespaces allowed on control plane without counting as workloads
CP_ALLOWED_NS = INFRA_REGEX
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
# Pods on control-plane nodes outside the allowed namespaces; "or on() vector(0)"
# yields 0 instead of an empty result when nothing matches.
CONTROL_WORKLOADS_EXPR = (
    f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
)
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# PromQL helpers
# ---------------------------------------------------------------------------
# node_uname_info with its nodename label copied into a "node" label.
NODE_INFO = 'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")'


def node_filter(regex):
    """Return a selector that evaluates to 1 for nodes matching the regex.

    The regex is interpolated verbatim into a nodename=~"..." matcher and the
    matching series get nodename copied into a "node" label.
    """
    return (
        f'label_replace(node_uname_info{{nodename=~"{regex}"}}, '
        '"node", "$1", "nodename", "(.*)")'
    )


def scoped_node_expr(base, scope=""):
    """Attach nodename metadata and optionally filter to a scope regex.

    base is a PromQL expression carrying an instance label; it is joined with
    NODE_INFO on instance and averaged by node. When scope is a non-empty
    regex the result is additionally multiplied by node_filter(scope).
    """
    expr = f"avg by (node) (({base}) * on(instance) group_left(node) {NODE_INFO})"
    if scope:
        expr = f"({expr}) * on(node) group_left() {node_filter(scope)}"
    return expr
def node_cpu_expr(scope=""):
    """Return a per-node CPU busy-percent expression, optionally scoped.

    Busy percent is (1 - average 5m idle rate per instance) * 100; node
    labelling and scope filtering are delegated to scoped_node_expr().
    """
    idle = 'avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))'
    base = f"(1 - {idle}) * 100"
    return scoped_node_expr(base, scope)
def node_mem_expr(scope=""):
    """Return a per-node memory-used-percent expression, optionally scoped.

    Usage is (MemTotal - MemAvailable) / MemTotal * 100, averaged by instance
    before scoped_node_expr() maps it onto node names.
    """
    usage = (
        "avg by (instance) ("
        "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) "
        "/ node_memory_MemTotal_bytes * 100)"
    )
    return scoped_node_expr(usage, scope)
def filesystem_usage_expr(mount, scope=""):
    """Return a per-node used-percent expression for one mountpoint.

    mount is interpolated verbatim into mountpoint="..." matchers; tmpfs and
    overlay filesystems are excluded. scope is forwarded to scoped_node_expr().
    """
    base = (
        f'avg by (instance) ('
        f'(1 - (node_filesystem_avail_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}} '
        f'/ node_filesystem_size_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}})) * 100)'
    )
    return scoped_node_expr(base, scope)
def root_usage_expr(scope=""):
    """Return the per-node used-percent expression for the "/" mountpoint."""
    return filesystem_usage_expr("/", scope)
2026-04-11 11:54:43 -03:00
def astraios_usage_expr(scope=""):
    """Return the per-node used-percent expression for ASTRAIOS_MOUNTPOINT."""
    return filesystem_usage_expr(ASTRAIOS_MOUNTPOINT, scope)
2025-11-17 16:27:38 -03:00
def astreae_usage_expr(mount):
    """Return an aggregate used-percent expression for a mountpoint.

    Computes 100 - (summed available bytes / summed size bytes * 100) over all
    series with the given mountpoint, excluding tmpfs and overlay filesystems.
    """
    return (
        f'100 - (sum(node_filesystem_avail_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}) / '
        f'sum(node_filesystem_size_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}) * 100)'
    )
def astreae_free_expr(mount):
    """Return an expression summing available bytes for a mountpoint (tmpfs/overlay excluded)."""
    return f'sum(node_filesystem_avail_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}})'
2025-11-17 20:19:20 -03:00
def topk_with_node(expr):
    """Return topk(1, expr) with the node label value copied into __name__."""
    return f'label_replace(topk(1, {expr}), "__name__", "$1", "node", "(.*)")'
2025-11-17 20:19:20 -03:00
2025-11-17 20:14:11 -03:00
def node_net_expr(scope=""):
    """Return a per-node network throughput expression, optionally scoped.

    Sums 5m receive and transmit byte rates per instance, skipping the "lo"
    device, then hands the result to scoped_node_expr().
    """
    base = (
        'sum by (instance) ('
        'rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
        '+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))'
    )
    return scoped_node_expr(base, scope)
def node_io_expr(scope=""):
    """Return a per-node disk throughput expression, optionally scoped.

    Sums 5m read and written byte rates per instance before
    scoped_node_expr() maps instances onto node names.
    """
    base = (
        "sum by (instance) (rate(node_disk_read_bytes_total[5m]) "
        "+ rate(node_disk_written_bytes_total[5m]))"
    )
    return scoped_node_expr(base, scope)
2026-01-01 14:44:33 -03:00
def namespace_selector(scope_var):
    """Return label matchers for container-level series within a namespace scope.

    Requires non-empty namespace, pod and container labels, drops the "POD"
    container, and appends scope_var verbatim as the final matcher.
    """
    return f'namespace!="",pod!="",container!="",container!="POD",{scope_var}'
2026-01-01 14:44:33 -03:00
def namespace_gpu_selector(scope_var):
    """Return label matchers requiring non-empty namespace and pod, plus scope_var verbatim."""
    return f'namespace!="",pod!="",{scope_var}'
def namespace_cpu_raw(scope_var):
    """Return per-namespace CPU usage as a summed rate over NAMESPACE_CPU_WINDOW."""
    return (
        "sum(rate(container_cpu_usage_seconds_total"
        f"{{{namespace_selector(scope_var)}}}[{NAMESPACE_CPU_WINDOW}])) by (namespace)"
    )
2026-01-01 14:44:33 -03:00
def namespace_ram_raw(scope_var):
    """Return per-namespace working-set memory summed over the selected containers."""
    return f"sum(container_memory_working_set_bytes{{{namespace_selector(scope_var)}}}) by (namespace)"
def namespace_gpu_usage_instant(scope_var):
    """Return the per-namespace GPU usage expression (alias of gpu_usage_by_namespace)."""
    return gpu_usage_by_namespace(scope_var)
2026-01-26 22:26:24 -03:00
def jetson_gpu_util_by_node():
    """Return the per-node maximum of jetson_gr3d_freq_percent for series with a node label."""
    return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'
2026-01-27 21:43:37 -03:00
def dcgm_gpu_util_by_node():
    """Return per-node average DCGM GPU utilisation.

    DCGM_FI_DEV_GPU_UTIL gets a pod label copied from Hostname and a fixed
    namespace="monitoring" label, so it can be joined on (namespace, pod) with
    kube_pod_info to pick up the node label.
    """
    dcgm_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")'
    dcgm_ns = 'label_replace(' + dcgm_pod + ', "namespace", "monitoring", "", "")'
    return (
        "avg by (node) ("
        f"{dcgm_ns} * on(namespace,pod) group_left(node) "
        'kube_pod_info{namespace="monitoring"}'
        ")"
    )
2026-01-27 21:43:37 -03:00
def gpu_util_by_node():
    """Return per-node GPU utilisation: the DCGM expression, "or" the Jetson one."""
    return f"{dcgm_gpu_util_by_node()} or {jetson_gpu_util_by_node()}"
def gpu_util_by_hostname():
    """Return gpu_util_by_node() with the node label copied into a Hostname label."""
    return 'label_replace(' + gpu_util_by_node() + ', "Hostname", "$1", "node", "(.*)")'
2026-05-16 05:58:59 -03:00
# Matches both "nvidia_com_gpu..." and "nvidia.com/gpu..." resource names;
# "[.]" keeps the dot literal without needing a backslash inside PromQL strings.
GPU_RESOURCE_REGEX = "nvidia(_com_|[.]com/)gpu.*"


def gpu_node_labels():
    """Return a per-node 0/1 series: 1 when the node has allocatable GPU resources."""
    return f'max by (node) (kube_node_status_allocatable{{resource=~"{GPU_RESOURCE_REGEX}"}} > bool 0)'
2026-01-27 21:43:37 -03:00
def gpu_requests_by_namespace_node(scope_var):
    """Return GPU resource requests summed by (namespace, node).

    Container requests matching GPU_RESOURCE_REGEX (plus scope_var, appended
    verbatim as a matcher) are joined with kube_pod_info for the node label
    and multiplied by gpu_node_labels().
    """
    return (
        "sum by (namespace,node) ("
        f'kube_pod_container_resource_requests{{resource=~"{GPU_RESOURCE_REGEX}",{scope_var}}} '
        "* on(namespace,pod) group_left(node) kube_pod_info "
        f"* on(node) group_left() ({gpu_node_labels()})"
        ")"
    )
def gpu_usage_by_namespace(scope_var):
    """Return per-namespace GPU usage, apportioned by each namespace's share of requests.

    On each node a namespace's requests are divided by the node's total
    requests (clamped to at least 1) and multiplied by gpu_util_by_node();
    the per-node results are then summed by namespace.
    """
    requests_by_ns = gpu_requests_by_namespace_node(scope_var)
    total_by_node = f"sum by (node) ({requests_by_ns})"
    return (
        "sum by (namespace) ("
        f"({requests_by_ns}) / on(node) group_left() clamp_min({total_by_node}, 1) "
        f"* on(node) group_left() ({gpu_util_by_node()})"
        ")"
    )
def jetson_gpu_usage_by_namespace(scope_var):
    """Return per-namespace GPU usage using only the Jetson utilisation metric.

    Same request-share apportioning as gpu_usage_by_namespace(), but the
    multiplier is jetson_gpu_util_by_node() instead of the combined source.
    """
    requests_by_ns = gpu_requests_by_namespace_node(scope_var)
    total_by_node = f"sum by (node) ({requests_by_ns})"
    return (
        "sum by (namespace) ("
        f"({requests_by_ns}) / on(node) group_left() clamp_min({total_by_node}, 1) "
        f"* on(node) group_left() {jetson_gpu_util_by_node()}"
        ")"
    )
2026-01-01 14:44:33 -03:00
2025-11-18 14:08:33 -03:00
def namespace_share_expr(resource_expr):
    """Return resource_expr as a percentage of its own sum.

    The denominator is clamped to at least 1 so the division never hits zero.
    """
    total = f"clamp_min(sum({resource_expr}), 1)"
    return f"100 * ({resource_expr}) / {total}"
2025-11-17 21:57:40 -03:00
2026-01-01 14:44:33 -03:00
def namespace_cpu_share_expr(scope_var):
    """Return each namespace's CPU usage as a percentage of the scoped total."""
    return namespace_share_expr(namespace_cpu_raw(scope_var))
2025-11-18 14:08:33 -03:00
2026-01-01 14:44:33 -03:00
def namespace_ram_share_expr(scope_var):
    """Return each namespace's working-set memory as a percentage of the scoped total."""
    return namespace_share_expr(namespace_ram_raw(scope_var))
2025-11-18 00:11:39 -03:00
2026-01-01 14:44:33 -03:00
def namespace_gpu_share_expr(scope_var):
    """Return each namespace's GPU usage share, with an "idle" fallback series.

    The share is 100 * usage / total (total clamped to at least 1). The second
    operand is a namespace="idle" series valued 100 * (total == 0), i.e. 100
    when total usage is zero; "or" keeps it alongside the per-namespace shares.
    """
    usage = namespace_gpu_usage_instant(scope_var)
    # "or on() vector(0)" turns an empty sum into 0 so scalar() below is defined.
    total = f"(sum({usage}) or on() vector(0))"
    share = f"100 * ({usage}) / clamp_min({total}, 1)"
    idle = 'label_replace(vector(100), "namespace", "idle", "", "") * scalar(' + total + " == bool 0)"
    return f"({share}) or ({idle})"
2025-11-17 23:12:16 -03:00
2025-12-12 20:30:00 -03:00
# Count of pods whose phase is neither Running nor Succeeded (0 when none).
PROBLEM_PODS_EXPR = (
    'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"})) '
    "or on() vector(0)"
)
# Count of pods with a container waiting on CrashLoopBackOff or ImagePullBackOff.
CRASHLOOP_EXPR = (
    'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
    '{reason=~"CrashLoopBackOff|ImagePullBackOff"})) '
    "or on() vector(0)"
)
# Count of pods whose deletion timestamp is more than 600 seconds in the past.
STUCK_TERMINATING_EXPR = (
    'sum(max by (namespace,pod) ('
    '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600) '
    'and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
    ')) '
    "or on() vector(0)"
)
2026-05-10 15:40:12 -03:00
UPTIME_WINDOW = "365d"
# vmalert precomputes the expensive long-window rollup so Grafana only reads one compact series.
UPTIME_RECORDING_METRIC = f'atlas:availability:ratio_{UPTIME_WINDOW}{{scope="atlas"}}'
UPTIME_RECORDING_EXPR = f"last_over_time({UPTIME_RECORDING_METRIC}[24h])"
# Available / desired traefik replicas (desired clamped to at least 1).
TRAEFIK_READY_EXPR = (
    "("
    'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"}) '
    "/ clamp_min("
    'sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)'
    ")"
)
# Fraction of control-plane nodes reporting Ready=true.
CONTROL_READY_FRACTION_EXPR = (
    f'(sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}}) '
    f"/ {CONTROL_TOTAL})"
)
UPTIME_AVAIL_EXPR = (
    f"min(({CONTROL_READY_FRACTION_EXPR}), ({TRAEFIK_READY_EXPR}))"
)
# Tie-breaker to deterministically pick one node per namespace when shares tie.
NODE_TIEBREAKER = " + ".join(
    f"({node_filter(node)}) * 1e-6 * {idx}"
    for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
)
UPTIME_AVG_EXPR = UPTIME_RECORDING_EXPR
UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
# "Number of nines": -log10(1 - availability); the clamp keeps the log argument above zero.
UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
# Thresholds on the nines scale.
UPTIME_THRESHOLDS = {
    "mode": "absolute",
    "steps": [
        {"color": "red", "value": None},
        {"color": "orange", "value": 2},
        {"color": "yellow", "value": 3},
        {"color": "green", "value": 3.5},
    ],
}
# Thresholds on the availability ratio itself (0..1).
UPTIME_PERCENT_THRESHOLDS = {
    "mode": "absolute",
    "steps": [
        {"color": "red", "value": None},
        {"color": "orange", "value": 0.99},
        {"color": "yellow", "value": 0.999},
        {"color": "green", "value": 0.9999},
        {"color": "blue", "value": 0.99999},
    ],
}
2025-11-17 16:27:38 -03:00
# Pod age in seconds for pods not Running/Succeeded, with node and phase labels joined in.
PROBLEM_TABLE_EXPR = (
    '(time() - kube_pod_created{pod!=""}) '
    "* on(namespace,pod) group_left(node) kube_pod_info "
    "* on(namespace,pod) group_left(phase) "
    'max by (namespace,pod,phase) (kube_pod_status_phase{phase!~"Running|Succeeded"})'
)
# Pod age in seconds for containers waiting on CrashLoopBackOff/ImagePullBackOff.
CRASHLOOP_TABLE_EXPR = (
    '(time() - kube_pod_created{pod!=""}) '
    "* on(namespace,pod) group_left(node) kube_pod_info "
    "* on(namespace,pod,container) group_left(reason) "
    "max by (namespace,pod,container,reason) "
    '(kube_pod_container_status_waiting_reason{reason=~"CrashLoopBackOff|ImagePullBackOff"})'
)
# Seconds since the deletion timestamp for pods that have one, with node joined in.
STUCK_TABLE_EXPR = (
    "("
    '((time() - kube_pod_deletion_timestamp{pod!=""}) '
    'and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)) '
    "* on(namespace,pod) group_left(node) kube_pod_info"
    ")"
)
2026-01-11 23:46:24 -03:00
# Namespace matchers: everything except infrastructure, everything, infrastructure only.
NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"'
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
2026-04-19 14:18:41 -03:00
def promql_task_regex(tasks):
    """Return a PromQL-safe regex alternation for the provided task names.

    Names are joined with "|". Each "." is emitted as "[.]" so it matches a
    literal dot instead of any character; a character class is used rather
    than a backslash so the result needs no extra escaping inside a
    double-quoted PromQL string. Only dots are neutralised: other regex
    metacharacters in a task name are passed through unchanged.
    """
    return "|".join(task.replace(".", "[.]") for task in tasks)
ARIADNE_ALL_SCHEDULE_TASKS = [
    "schedule.mailu_sync",
    "schedule.nextcloud_sync",
    "schedule.nextcloud_cron",
    "schedule.nextcloud_maintenance",
    "schedule.vaultwarden_sync",
    "schedule.wger_user_sync",
    "schedule.wger_admin",
    "schedule.firefly_user_sync",
    "schedule.firefly_cron",
    "schedule.vault_k8s_auth",
    "schedule.vault_oidc",
    "schedule.comms_guest_name",
    "schedule.comms_pin_invite",
    "schedule.comms_reset_room",
    "schedule.comms_seed_room",
    "schedule.pod_cleaner",
    "schedule.opensearch_prune",
    "schedule.image_sweeper",
    "schedule.metis_k3s_token_sync",
    "schedule.platform_quality_suite_probe",
]
# Every task except the two comms tasks excluded below.
ARIADNE_FAST_SCHEDULE_TASKS = [
    task
    for task in ARIADNE_ALL_SCHEDULE_TASKS
    if task not in {"schedule.comms_pin_invite", "schedule.comms_reset_room"}
]
# Subset used by the stale / failed health expressions below.
ARIADNE_SCHEDULE_HEALTH_TASKS = [
    "schedule.nextcloud_sync",
    "schedule.nextcloud_cron",
    "schedule.vaultwarden_sync",
    "schedule.wger_user_sync",
    "schedule.firefly_user_sync",
    "schedule.comms_guest_name",
    "schedule.comms_seed_room",
    "schedule.pod_cleaner",
    "schedule.image_sweeper",
    "schedule.metis_k3s_token_sync",
    "schedule.platform_quality_suite_probe",
]
ARIADNE_ALL_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_ALL_SCHEDULE_TASKS)})$"'
ARIADNE_FAST_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_FAST_SCHEDULE_TASKS)})$"'
ARIADNE_SCHEDULE_HEALTH_FILTER = f'task=~"^({promql_task_regex(ARIADNE_SCHEDULE_HEALTH_TASKS)})$"'
ARIADNE_ALL_SCHEDULE_NEXT_RUN = f"ariadne_schedule_next_run_timestamp_seconds{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
ARIADNE_ALL_SCHEDULE_LAST_SUCCESS = (
    f"ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
)
ARIADNE_ALL_SCHEDULE_LAST_ERROR = f"ariadne_schedule_last_error_timestamp_seconds{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
ARIADNE_ALL_SCHEDULE_LAST_STATUS = f"ariadne_schedule_last_status{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
ARIADNE_FAST_SCHEDULE_LAST_SUCCESS = (
    f"ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_FAST_SCHEDULE_FILTER}}}"
)
ARIADNE_FAST_SCHEDULE_LAST_ERROR = f"ariadne_schedule_last_error_timestamp_seconds{{{ARIADNE_FAST_SCHEDULE_FILTER}}}"
ARIADNE_FAST_SCHEDULE_LAST_STATUS = f"ariadne_schedule_last_status{{{ARIADNE_FAST_SCHEDULE_FILTER}}}"
ARIADNE_HEALTH_SCHEDULE_LAST_SUCCESS = (
    f"ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_SCHEDULE_HEALTH_FILTER}}}"
)
ARIADNE_HEALTH_SCHEDULE_LAST_STATUS = f"ariadne_schedule_last_status{{{ARIADNE_SCHEDULE_HEALTH_FILTER}}}"
ARIADNE_SCHEDULE_LAST_SUCCESS_AGE = f"(time() - {ARIADNE_HEALTH_SCHEDULE_LAST_SUCCESS})"
ARIADNE_SCHEDULE_LAST_ERROR_AGE = f"(time() - {ARIADNE_ALL_SCHEDULE_LAST_ERROR})"
ARIADNE_SCHEDULE_LAST_SUCCESS_AGE_HOURS = f"({ARIADNE_SCHEDULE_LAST_SUCCESS_AGE}) / 3600"
ARIADNE_SCHEDULE_LAST_ERROR_AGE_HOURS = f"({ARIADNE_SCHEDULE_LAST_ERROR_AGE}) / 3600"
# A health task is stale once its last success is older than 36 hours.
ARIADNE_SCHEDULE_STALE_WINDOW_SEC = 36 * 3600
ARIADNE_SCHEDULE_STALE = f"(({ARIADNE_SCHEDULE_LAST_SUCCESS_AGE}) > bool {ARIADNE_SCHEDULE_STALE_WINDOW_SEC})"
# NOTE(review): the left side selects ALL tasks but the right side only health
# tasks, so any task outside ARIADNE_SCHEDULE_HEALTH_TASKS appears to be
# reported as missing whenever it has a next-run series -- confirm intent.
ARIADNE_SCHEDULE_MISSING = (
    f"({ARIADNE_ALL_SCHEDULE_NEXT_RUN} unless on(task) {ARIADNE_HEALTH_SCHEDULE_LAST_SUCCESS})"
)
ARIADNE_SCHEDULE_FAILED = f"((1 - {ARIADNE_HEALTH_SCHEDULE_LAST_STATUS}) > bool 0)"
ARIADNE_SCHEDULE_STALE_COUNT = f"sum({ARIADNE_SCHEDULE_STALE}) or on() vector(0)"
ARIADNE_SCHEDULE_MISSING_COUNT = f"count({ARIADNE_SCHEDULE_MISSING}) or on() vector(0)"
ARIADNE_SCHEDULE_FAILED_COUNT = f"sum({ARIADNE_SCHEDULE_FAILED}) or on() vector(0)"
2026-01-21 14:30:55 -03:00
# Per-task error/success run counts over various windows.
ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))'
ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))'
ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
# Totals across all tasks.
ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[5m]))'
ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[5m]))'
# Runs whose status is neither "ok" nor "error"; 0 when there are none.
ARIADNE_TASK_WARNINGS_SERIES = (
    'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)'
)
2026-04-19 14:18:41 -03:00
# Hours since the last success / error timestamp for every scheduled task.
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = f"(time() - {ARIADNE_ALL_SCHEDULE_LAST_SUCCESS}) / 3600"
ARIADNE_SCHEDULE_LAST_ERROR_HOURS = f"(time() - {ARIADNE_ALL_SCHEDULE_LAST_ERROR}) / 3600"
# Same, but using the newest timestamp seen within the dashboard range.
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
    f"(time() - max_over_time({ARIADNE_ALL_SCHEDULE_LAST_SUCCESS}[$__range])) / 3600"
)
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
    f"(time() - max_over_time({ARIADNE_ALL_SCHEDULE_LAST_ERROR}[$__range])) / 3600"
)
ARIADNE_FAST_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
    f"(time() - max_over_time({ARIADNE_FAST_SCHEDULE_LAST_SUCCESS}[$__range])) / 3600"
)
# NOTE(review): despite the FAST name this uses the ALL-tasks next-run
# selector -- confirm whether the fast filter was intended.
ARIADNE_FAST_SCHEDULE_NEXT_RUN_HOURS = f"(({ARIADNE_ALL_SCHEDULE_NEXT_RUN} - time()) / 3600)"
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
2026-04-10 15:35:20 -03:00
# Canonical suite names, in display order.
PLATFORM_TEST_SUITE_NAMES = [
    "ariadne",
    "metis",
    "ananke",
    "atlasbot",
    "pegasus",
    "soteria",
    "titan_iac",
    "bstein_home",
    "data_prepper",
]
PLATFORM_TEST_SUCCESS_STATUS = "ok|passed|success"
PLATFORM_TEST_NON_FAILURE_STATUS = f"{PLATFORM_TEST_SUCCESS_STATUS}|not_applicable|skipped|na|n/a"
PLATFORM_TEST_CI_JOB = "platform-quality-ci"
PLATFORM_TEST_EXPORT_FILTER = f'exported_job="{PLATFORM_TEST_CI_JOB}"'
# Canonical suite name -> regex alternation of the suite label spellings to match.
PLATFORM_TEST_SUITE_VALUE_BY_NAME = {
    "ariadne": "ariadne",
    "metis": "metis",
    "ananke": "ananke",
    "atlasbot": "atlasbot",
    "pegasus": "pegasus|pegasus-health|pegasus_health",
    "soteria": "soteria",
    "titan_iac": "titan_iac|titan-iac",
    "bstein_home": "bstein_home|bstein-home",
    "data_prepper": "data_prepper|data-prepper",
}
# Canonical suite name -> Jenkins job name.
PLATFORM_TEST_JENKINS_JOB_BY_SUITE = {
    "ariadne": "ariadne",
    "metis": "metis",
    "ananke": "ananke",
    "atlasbot": "atlasbot",
    "pegasus": "pegasus",
    "soteria": "Soteria",
    "titan_iac": "titan-iac",
    "bstein_home": "bstein-dev-home",
    "data_prepper": "data-prepper",
}
JENKINS_UI_BASE_DEFAULT = "https://ci.bstein.dev"
# All accepted spellings of every suite, for suite=~"..." matchers.
PLATFORM_TEST_SUITE_MATCHER = "|".join(
    PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite) for suite in PLATFORM_TEST_SUITE_NAMES
)
# Canonical names only.
PLATFORM_TEST_SUITE_CANONICAL_MATCHER = "|".join(PLATFORM_TEST_SUITE_NAMES)
PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER = PLATFORM_TEST_SUITE_CANONICAL_MATCHER
2026-04-04 01:33:15 -03:00
# Successful vs. total quality-gate runs over 30d / 7d / 24h; each falls back
# to 0 via "or on() vector(0)" when no series match.
PLATFORM_TEST_SUCCESS_EVENTS_30D = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[30d])) or on() vector(0))'
)
PLATFORM_TEST_TOTAL_EVENTS_30D = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[30d])) or on() vector(0))'
)
PLATFORM_TEST_SUCCESS_EVENTS_7D = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[7d])) or on() vector(0))'
)
PLATFORM_TEST_TOTAL_EVENTS_7D = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[7d])) or on() vector(0))'
)
PLATFORM_TEST_SUCCESS_EVENTS_24H = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h])) or on() vector(0))'
)
PLATFORM_TEST_TOTAL_EVENTS_24H = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h])) or on() vector(0))'
)
# Success percentage per window; the denominator is clamped to at least 1.
TEST_SUCCESS_RATE = (
    f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_30D}) / clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_30D}), 1)"
)
TEST_SUCCESS_RATE_7D = (
    f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_7D}) / clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_7D}), 1)"
)
TEST_SUCCESS_RATE_24H = (
    f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_24H}) / clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_24H}), 1)"
)
# Runs in the last 24h whose status is not one of the success statuses.
TEST_FAILURES_24H_TOTAL = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status!~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h])) or on() vector(0))'
)
PLATFORM_TEST_FAILURES_24H_BY_SUITE = (
    f'sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status!~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h])))'
)
PLATFORM_TEST_ACTIVITY_30D = (
    f'sum by (suite, status) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[30d]))'
)
PLATFORM_TEST_RUNS_24H_TOTAL = PLATFORM_TEST_TOTAL_EVENTS_24H
# Sum of the per-suite 24h run increases that are above zero (0 when none are).
PLATFORM_TEST_ACTIVE_SUITES_24H = (
    f'sum((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h])) > 0)) '
    "or on() vector(0)"
)
2026-04-09 19:27:48 -03:00
# Window used for each point of the per-suite success-rate series.
PLATFORM_TEST_POINT_WINDOW = "1h"
# One target dict per suite, with refId "A", "B", ... in PLATFORM_TEST_SUITE_NAMES
# order; each expr is success runs / total runs (clamped to at least 1) * 100
# over the point window, and the legend is the canonical suite name.
PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS = [
    {
        "refId": chr(ord("A") + index),
        "expr": (
            f'(100 * (sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite)}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}'
            f'[{PLATFORM_TEST_POINT_WINDOW}]))) / '
            f'clamp_min((sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite)}",{PLATFORM_TEST_EXPORT_FILTER}}}[{PLATFORM_TEST_POINT_WINDOW}]))), 1))'
        ),
        "legendFormat": suite,
    }
    for index, suite in enumerate(PLATFORM_TEST_SUITE_NAMES)
]
2026-04-09 20:16:44 -03:00
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
    f'sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h]))) '
    f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h]))), 1))'
)
# Per-suite 30d run count; used below as the list of suites that reported at all.
QUALITY_GATE_SUITE_INDEX_30D = (
    f'sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[30d]))'
)
# Coverage from any *_quality_gate_coverage_percent metric, falling back per
# suite to the workspace line-coverage metric.
QUALITY_GATE_COVERAGE_BY_SUITE = (
    f'(max by (suite) ({{__name__=~".*_quality_gate_coverage_percent",{PLATFORM_TEST_EXPORT_FILTER}}})) '
    f'or on(suite) (max by (suite) (platform_quality_gate_workspace_line_coverage_percent{{{PLATFORM_TEST_EXPORT_FILTER}}}))'
)
# Suites that ran in the last 30d but have no coverage series get -1 (0 * x - 1).
QUALITY_GATE_COVERAGE_BY_SUITE_WITH_MISSING = (
    f"({QUALITY_GATE_COVERAGE_BY_SUITE}) or on(suite) (0 * ({QUALITY_GATE_SUITE_INDEX_30D}) - 1)"
)
# Distance below 95, floored at 0.
QUALITY_GATE_COVERAGE_GAP_BY_SUITE = (
    f"clamp_min(95 - ({QUALITY_GATE_COVERAGE_BY_SUITE}), 0)"
)
QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE = (
    f"max by (suite) (platform_quality_gate_source_lines_over_500_total{{{PLATFORM_TEST_EXPORT_FILTER}}})"
)
QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE_WITH_MISSING = (
    f"({QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE}) or on(suite) (0 * ({QUALITY_GATE_SUITE_INDEX_30D}) - 1)"
)
# Label matchers (without braces) selecting every *_quality_gate_checks_total series.
PLATFORM_TEST_CHECKS_SELECTOR = (
    f'__name__=~".*_quality_gate_checks_total",suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}",'
    f"{PLATFORM_TEST_EXPORT_FILTER}"
)
# Percentage of distinct (suite, check) pairs with a non-failure result, per suite.
PLATFORM_TEST_CURRENT_GATE_HEALTH_BY_SUITE = (
    f'(100 * sum by (suite) (max by (suite, check) '
    f'(({{{PLATFORM_TEST_CHECKS_SELECTOR},result=~"{PLATFORM_TEST_NON_FAILURE_STATUS}"}} > bool 0))) '
    f'/ clamp_min(sum by (suite) (max by (suite, check) '
    f'(({{{PLATFORM_TEST_CHECKS_SELECTOR}}} > bool 0))), 1))'
)
2026-04-19 14:18:41 -03:00
# Backup age per (namespace, pvc); PVCs with no age series fall back to
# (1 - pvc_backup_health) * 999.
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))"
2026-04-08 23:33:17 -03:00
ANANKE_SELECTOR = 'job="ananke-power"'
# UPS source names and the node each constant pairs them with.
ANANKE_UPS_DB_NAME = "Pyrphoros"
ANANKE_UPS_DB_NODE = "titan-db"
ANANKE_UPS_TETHYS_NAME = "Statera"
ANANKE_UPS_TETHYS_NODE = "titan-24"
ANANKE_UPS_DB_SELECTOR = f'{ANANKE_SELECTOR},source="{ANANKE_UPS_DB_NAME}"'
ANANKE_UPS_TETHYS_SELECTOR = f'{ANANKE_SELECTOR},source="{ANANKE_UPS_TETHYS_NAME}"'
ANANKE_UPS_ON_BATTERY = f"sum(ananke_ups_on_battery{{{ANANKE_SELECTOR}}}) or on() vector(0)"
ANANKE_UPS_LOW_BATTERY = f"sum(ananke_ups_low_battery{{{ANANKE_SELECTOR}}}) or on() vector(0)"
ANANKE_UPS_RUNTIME_MIN = f"min(ananke_ups_runtime_seconds{{{ANANKE_SELECTOR}}}) or on() vector(0)"
# Smallest runtime as a percentage of the largest threshold (threshold clamped to >= 1).
ANANKE_UPS_RUNTIME_HEADROOM_PERCENT = (
    f"100 * min(ananke_ups_runtime_seconds{{{ANANKE_SELECTOR}}}) / "
    f"clamp_min(max(ananke_ups_threshold_seconds{{{ANANKE_SELECTOR}}}), 1)"
)
ANANKE_UPS_TRIGGER_COUNT_1D = f"increase(ananke_shutdown_triggers_total{{{ANANKE_SELECTOR}}}[1d]) or on() vector(0)"
2026-05-15 19:37:03 -03:00
# GitOps metrics are selected with the same job matcher as the UPS metrics.
GITOPS_SELECTOR = ANANKE_SELECTOR
# Flux source info for flux-system/flux-system, one series per
# (branch, revision).
GITOPS_SOURCE_INFO = (
f ' max by (branch, revision) (ananke_gitops_flux_source_info {{ { GITOPS_SELECTOR } ,namespace= " flux-system " ,name= " flux-system " }} ) '
)
# Percentage of Kustomizations that are ready; each (namespace, name) counts
# once and the total is floored at 1.
GITOPS_KUSTOMIZATION_READY_PCT = (
f " 100 * sum(max by (namespace, name) (ananke_gitops_kustomization_ready {{ { GITOPS_SELECTOR } }} )) "
f " / clamp_min(count(max by (namespace, name) (ananke_gitops_kustomization_ready {{ { GITOPS_SELECTOR } }} )), 1) "
)
2026-05-15 22:07:41 -03:00
# Ready and total Kustomization counts, defaulting to 0 when no series exist.
GITOPS_KUSTOMIZATION_READY_COUNT = (
f " sum(max by (namespace, name) (ananke_gitops_kustomization_ready {{ { GITOPS_SELECTOR } }} )) or on() vector(0) "
)
GITOPS_KUSTOMIZATION_TOTAL_COUNT = (
f " count(max by (namespace, name) (ananke_gitops_kustomization_ready {{ { GITOPS_SELECTOR } }} )) or on() vector(0) "
)
2026-05-15 19:37:03 -03:00
# Number of suspended Kustomizations, defaulting to 0.
GITOPS_KUSTOMIZATION_SUSPENDED = (
f " sum(max by (namespace, name) (ananke_gitops_kustomization_suspended {{ { GITOPS_SELECTOR } }} )) or on() vector(0) "
)
2026-05-16 02:21:05 -03:00
# Percentage of Kustomizations that are not suspended
# (100 * (1 - suspended / max(total, 1))).
GITOPS_KUSTOMIZATION_NOT_SUSPENDED_PCT = (
f " 100 * (1 - ( { GITOPS_KUSTOMIZATION_SUSPENDED } ) / clamp_min(( { GITOPS_KUSTOMIZATION_TOTAL_COUNT } ), 1)) "
)
2026-05-15 19:37:03 -03:00
# HelmRelease equivalents of the Kustomization expressions above.
GITOPS_HELM_READY_PCT = (
f " 100 * sum(max by (namespace, name) (ananke_gitops_helmrelease_ready {{ { GITOPS_SELECTOR } }} )) "
f " / clamp_min(count(max by (namespace, name) (ananke_gitops_helmrelease_ready {{ { GITOPS_SELECTOR } }} )), 1) "
)
2026-05-15 22:07:41 -03:00
GITOPS_HELM_READY_COUNT = (
f " sum(max by (namespace, name) (ananke_gitops_helmrelease_ready {{ { GITOPS_SELECTOR } }} )) or on() vector(0) "
)
GITOPS_HELM_TOTAL_COUNT = (
f " count(max by (namespace, name) (ananke_gitops_helmrelease_ready {{ { GITOPS_SELECTOR } }} )) or on() vector(0) "
)
2026-05-15 19:37:03 -03:00
GITOPS_HELM_SUSPENDED = (
f " sum(max by (namespace, name) (ananke_gitops_helmrelease_suspended {{ { GITOPS_SELECTOR } }} )) or on() vector(0) "
)
2026-05-16 02:21:05 -03:00
GITOPS_HELM_NOT_SUSPENDED_PCT = (
f " 100 * (1 - ( { GITOPS_HELM_SUSPENDED } ) / clamp_min(( { GITOPS_HELM_TOTAL_COUNT } ), 1)) "
2026-05-15 22:43:44 -03:00
)
2026-05-15 19:37:03 -03:00
# Lowest scrape-success value and seconds since the newest scrape timestamp,
# both defaulting to 0 when no series exist.
GITOPS_SCRAPE_SUCCESS = f " min(ananke_gitops_scrape_success {{ { GITOPS_SELECTOR } }} ) or on() vector(0) "
GITOPS_LAST_SCRAPE_AGE = (
f " (time() - max(ananke_gitops_last_scrape_timestamp_seconds {{ { GITOPS_SELECTOR } }} )) or on() vector(0) "
)
2026-04-08 23:33:17 -03:00
# Per-UPS single-value expressions. Each takes max() over the series matched
# by the DB or Tethys selector and falls back to 0 when nothing matches.
ANANKE_UPS_RUNTIME_DB = (
f ' max(ananke_ups_runtime_seconds {{ { ANANKE_UPS_DB_SELECTOR } }} ) or on() vector(0) '
2026-04-03 17:49:09 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_RUNTIME_TETHYS = (
f ' max(ananke_ups_runtime_seconds {{ { ANANKE_UPS_TETHYS_SELECTOR } }} ) or on() vector(0) '
2026-04-03 17:49:09 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_ON_BATTERY_DB = (
f ' max(ananke_ups_on_battery {{ { ANANKE_UPS_DB_SELECTOR } }} ) or on() vector(0) '
2026-04-03 17:49:09 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_ON_BATTERY_TETHYS = (
f ' max(ananke_ups_on_battery {{ { ANANKE_UPS_TETHYS_SELECTOR } }} ) or on() vector(0) '
2026-04-03 17:49:09 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_BATTERY_CHARGE_DB = (
f ' max(ananke_ups_battery_charge_percent {{ { ANANKE_UPS_DB_SELECTOR } }} ) or on() vector(0) '
2026-04-03 17:49:09 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_BATTERY_CHARGE_TETHYS = (
f ' max(ananke_ups_battery_charge_percent {{ { ANANKE_UPS_TETHYS_SELECTOR } }} ) or on() vector(0) '
2026-04-03 17:49:09 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_LOAD_DB = (
f ' max(ananke_ups_load_percent {{ { ANANKE_UPS_DB_SELECTOR } }} ) or on() vector(0) '
2026-04-03 17:49:09 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_LOAD_TETHYS = (
f ' max(ananke_ups_load_percent {{ { ANANKE_UPS_TETHYS_SELECTOR } }} ) or on() vector(0) '
2026-04-03 20:45:40 -03:00
)
2026-04-08 23:33:17 -03:00
# Estimated draw in watts: load percent * nominal watts / 100.
ANANKE_UPS_DRAW_WATTS_DB = (
f ' max((ananke_ups_load_percent {{ { ANANKE_UPS_DB_SELECTOR } }} '
f ' * ananke_ups_power_nominal_watts {{ { ANANKE_UPS_DB_SELECTOR } }} ) / 100) or on() vector(0) '
2026-04-03 20:45:40 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_DRAW_WATTS_TETHYS = (
f ' max((ananke_ups_load_percent {{ { ANANKE_UPS_TETHYS_SELECTOR } }} '
f ' * ananke_ups_power_nominal_watts {{ { ANANKE_UPS_TETHYS_SELECTOR } }} ) / 100) or on() vector(0) '
)
# Same draw estimate without max() or the zero fallback, so the original
# series labels are kept.
ANANKE_UPS_DRAW_WATTS_DB_SERIES = (
f ' ((ananke_ups_load_percent {{ { ANANKE_UPS_DB_SELECTOR } }} '
f ' * ananke_ups_power_nominal_watts {{ { ANANKE_UPS_DB_SELECTOR } }} ) / 100) '
2026-04-03 20:45:40 -03:00
)
2026-04-08 23:33:17 -03:00
ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES = (
f ' ((ananke_ups_load_percent {{ { ANANKE_UPS_TETHYS_SELECTOR } }} '
f ' * ananke_ups_power_nominal_watts {{ { ANANKE_UPS_TETHYS_SELECTOR } }} ) / 100) '
2026-04-03 20:45:40 -03:00
)
2026-04-08 23:33:17 -03:00
# Raw, unaggregated series for every UPS matched by the base selector.
ANANKE_UPS_RUNTIME_BY_SOURCE = f " ananke_ups_runtime_seconds {{ { ANANKE_SELECTOR } }} "
ANANKE_UPS_LOAD_BY_SOURCE = f " ananke_ups_load_percent {{ { ANANKE_SELECTOR } }} "
ANANKE_UPS_CHARGE_BY_SOURCE = f " ananke_ups_battery_charge_percent {{ { ANANKE_SELECTOR } }} "
ANANKE_UPS_TRIGGER_BY_SOURCE = f " ananke_ups_trigger_active {{ { ANANKE_SELECTOR } }} "
2026-04-12 17:20:05 -03:00
# Number of typhon_temperature_celsius series, 0 when none exist.
CLIMATE_SENSOR_COUNT = " count(typhon_temperature_celsius) or on() vector(0) "
2026-04-19 14:18:41 -03:00
# Current maxima across sensors, each defaulting to 0.
# NOTE(review): the *_PRESSURE_* constants read typhon_vpd_kpa; presumably
# vapour-pressure deficit rather than air pressure - confirm.
CLIMATE_TEMP_MAX = " max(typhon_temperature_celsius) or on() vector(0) "
CLIMATE_PRESSURE_CURRENT = " max(typhon_vpd_kpa) or on() vector(0) "
CLIMATE_HUMIDITY_MAX = " max(typhon_relative_humidity_percent) or on() vector(0) "
# Raw series for time-series panels.
CLIMATE_TEMP_SERIES = " typhon_temperature_celsius "
CLIMATE_PRESSURE_SERIES = " typhon_vpd_kpa "
CLIMATE_HUMIDITY_SERIES = " typhon_relative_humidity_percent "
# Dew point from temperature and relative humidity:
#   g  = ln(RH / 100) + 17.62 * T / (243.12 + T)
#   Td = 243.12 * g / (17.62 - g)
# RH is floored at 1 so ln() never receives 0.
CLIMATE_DEWPOINT_SERIES = (
" (243.12 * (ln(clamp_min(typhon_relative_humidity_percent, 1) / 100) + "
" (17.62 * typhon_temperature_celsius) / (243.12 + typhon_temperature_celsius))) / "
" (17.62 - (ln(clamp_min(typhon_relative_humidity_percent, 1) / 100) + "
" (17.62 * typhon_temperature_celsius) / (243.12 + typhon_temperature_celsius))) "
)
CLIMATE_DEWPOINT_CURRENT = f " max( { CLIMATE_DEWPOINT_SERIES } ) or on() vector(0) "
2026-04-03 20:45:40 -03:00
# Current fan speed level per fan_group (max across matching series, 0 when
# absent).
CLIMATE_FAN_OUTLET_CURRENT = (
2026-04-19 14:18:41 -03:00
' max(typhon_fan_speed_level { fan_group= " outlet " }) or on() vector(0) '
2026-04-03 20:45:40 -03:00
)
CLIMATE_FAN_INSIDE_INLET_CURRENT = (
2026-04-19 14:18:41 -03:00
' max(typhon_fan_speed_level { fan_group= " inside_inlet " }) or on() vector(0) '
2026-04-03 20:45:40 -03:00
)
CLIMATE_FAN_OUTSIDE_INLET_CURRENT = (
2026-04-19 14:18:41 -03:00
' max(typhon_fan_speed_level { fan_group= " outside_inlet " }) or on() vector(0) '
2026-04-03 20:45:40 -03:00
)
CLIMATE_FAN_INTERIOR_CURRENT = (
2026-04-19 14:18:41 -03:00
' max(typhon_fan_speed_level { fan_group= " interior " }) or on() vector(0) '
2026-04-03 20:45:40 -03:00
)
# Raw fan speed series per fan_group.
CLIMATE_FAN_OUTLET_SERIES = (
2026-04-19 14:18:41 -03:00
' typhon_fan_speed_level { fan_group= " outlet " } '
2026-04-03 20:45:40 -03:00
)
CLIMATE_FAN_INSIDE_INLET_SERIES = (
2026-04-19 14:18:41 -03:00
' typhon_fan_speed_level { fan_group= " inside_inlet " } '
2026-04-03 20:45:40 -03:00
)
CLIMATE_FAN_OUTSIDE_INLET_SERIES = (
2026-04-19 14:18:41 -03:00
' typhon_fan_speed_level { fan_group= " outside_inlet " } '
2026-04-03 20:45:40 -03:00
)
CLIMATE_FAN_INTERIOR_SERIES = (
2026-04-19 14:18:41 -03:00
' typhon_fan_speed_level { fan_group= " interior " } '
2026-04-12 22:07:58 -03:00
)
2026-01-22 18:23:17 -03:00
# Two series distinguished by a synthetic "conn" label: conn="used" is the
# sum of pg_stat_activity_count, conn="max" is pg_settings_max_connections.
POSTGRES_CONN_USED = (
' label_replace(sum(pg_stat_activity_count), " conn " , " used " , " __name__ " , " .* " ) '
' or label_replace(max(pg_settings_max_connections), " conn " , " max " , " __name__ " , " .* " ) '
2026-01-22 15:23:23 -03:00
)
# Database (datname) with the highest pg_stat_activity_count.
POSTGRES_CONN_HOTTEST = ' topk(1, sum by (datname) (pg_stat_activity_count)) '
2026-01-21 13:37:36 -03:00
# Jobs owned by a CronJob, with job_name copied into owner_name so the result
# can be matched against kube_pod_owner.
ONEOFF_JOB_OWNER = (
' label_replace(kube_job_owner { owner_kind= " CronJob " }, " owner_name " , " $1 " , " job_name " , " (.*) " ) '
)
# Pods owned by a Job, excluding Jobs that ONEOFF_JOB_OWNER identifies as
# CronJob-owned.
ONEOFF_JOB_PODS = f ' (kube_pod_owner {{ owner_kind= " Job " }} unless on(namespace, owner_name) { ONEOFF_JOB_OWNER } ) '
# Age in hours of those pods, joined with owner_name and with the pod phase
# (Running or Succeeded only).
ONEOFF_JOB_POD_AGE_HOURS = (
' ((time() - kube_pod_start_time { pod!= " " }) / 3600) '
f ' * on(namespace,pod) group_left(owner_name) { ONEOFF_JOB_PODS } '
' * on(namespace,pod) group_left(phase) '
' max by (namespace,pod,phase) (kube_pod_status_phase { phase=~ " Running|Succeeded " }) '
)
2025-11-18 10:47:24 -03:00
# Node names treated as GPU nodes, plus the same names joined with "|" for
# use as a regex alternation.
GPU_NODES = [ " titan-20 " , " titan-21 " , " titan-22 " , " titan-24 " ]
GPU_NODE_REGEX = " | " . join ( GPU_NODES )
2025-11-17 18:55:11 -03:00
# Request rate per Traefik router over 5 minutes.
TRAEFIK_ROUTER_EXPR = " sum by (router) (rate(traefik_router_requests_total[5m])) "
2025-11-18 14:08:33 -03:00
# Bytes/s received and transmitted by traefik-* pods in the traefik
# namespace, defaulting to 0.
TRAEFIK_NET_INGRESS = (
' sum(rate(container_network_receive_bytes_total { namespace= " traefik " ,pod=~ " traefik-.* " }[5m])) '
2025-11-18 11:30:33 -03:00
" or on() vector(0) "
)
2025-11-18 14:08:33 -03:00
TRAEFIK_NET_EGRESS = (
' sum(rate(container_network_transmit_bytes_total { namespace= " traefik " ,pod=~ " traefik-.* " }[5m])) '
" or on() vector(0) "
)
2025-11-18 15:55:24 -03:00
# Cluster-wide container network throughput (series with non-empty
# namespace, pod and container labels).
NET_CLUSTER_RX = (
' sum(rate(container_network_receive_bytes_total { namespace!= " " ,pod!= " " ,container!= " " }[5m])) '
" or on() vector(0) "
)
NET_CLUSTER_TX = (
2025-11-18 11:30:33 -03:00
' sum(rate(container_network_transmit_bytes_total { namespace!= " " ,pod!= " " ,container!= " " }[5m])) '
" or on() vector(0) "
)
2025-11-18 16:18:52 -03:00
# Device matcher that excludes loopback and virtual/overlay interfaces.
PHYSICAL_NET_FILTER = ' device!~ " lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.* " '
# Node-level throughput over the remaining (physical) devices.
NET_NODE_RX_PHYS = (
f ' sum(rate(node_network_receive_bytes_total {{ { PHYSICAL_NET_FILTER } }} [5m])) or on() vector(0) '
)
NET_NODE_TX_PHYS = (
f ' sum(rate(node_network_transmit_bytes_total {{ { PHYSICAL_NET_FILTER } }} [5m])) or on() vector(0) '
)
# Aliases used by the dashboards.
# NOTE(review): NET_TOTAL_EXPR is bound to the TX expression only, not
# RX + TX - confirm this is intentional.
NET_TOTAL_EXPR = NET_NODE_TX_PHYS
NET_INGRESS_EXPR = NET_NODE_RX_PHYS
NET_EGRESS_EXPR = NET_NODE_TX_PHYS
# Receive + transmit rate of pods outside the traefik namespace.
NET_INTERNAL_EXPR = (
2025-11-18 17:09:13 -03:00
' sum(rate(container_network_receive_bytes_total { namespace!= " traefik " ,pod!= " " }[5m]) '
' + rate(container_network_transmit_bytes_total { namespace!= " traefik " ,pod!= " " }[5m])) '
2025-11-18 16:18:52 -03:00
' or on() vector(0) '
)
2025-12-12 18:00:43 -03:00
# Rate of apiserver requests answered with a 5xx code.
APISERVER_5XX_RATE = ' sum(rate(apiserver_request_total { code=~ " 5.. " }[5m])) '
# 99th-percentile request latency; the "* 1000" converts the *_seconds
# histogram result to milliseconds.
APISERVER_P99_LATENCY_MS = (
" histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000 "
)
ETCD_P99_LATENCY_MS = (
" histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000 "
)
# Traefik entrypoint request rates (requests per second) over 5 minutes:
# all requests, and requests whose status code is not 5xx.
TRAEFIK_TOTAL_5M = "sum(rate(traefik_entrypoint_requests_total[5m]))"
TRAEFIK_SUCCESS_5M = 'sum(rate(traefik_entrypoint_requests_total{code!~"5.."}[5m]))'
# Availability SLI = success rate / total rate. The denominator is a
# per-second rate, so flooring it at 1 would deflate the ratio whenever
# traffic is below 1 req/s. A tiny floor still guards against division by
# zero (0 / 1e-9 == 0, as before) without distorting low-traffic periods.
TRAEFIK_SLI_5M = f"({TRAEFIK_SUCCESS_5M}) / clamp_min({TRAEFIK_TOTAL_5M}, 1e-9)"
# 99th / 95th percentile Traefik entrypoint latency, converted from seconds
# to milliseconds by the trailing "* 1000".
TRAEFIK_P99_LATENCY_MS = (
" histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000 "
)
TRAEFIK_P95_LATENCY_MS = (
" histogram_quantile(0.95, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000 "
)
# Availability objective; traefik_burn() divides by (1 - SLO_AVAILABILITY).
SLO_AVAILABILITY = 0.999
def traefik_sli(window):
    """Return a PromQL expression for the Traefik availability SLI.

    The SLI is the rate of non-5xx entrypoint requests divided by the rate of
    all entrypoint requests over ``window``.

    Args:
        window: PromQL range duration, e.g. ``"5m"`` or ``"30d"``.

    Returns:
        str: the ratio expression.
    """
    metric = "traefik_entrypoint_requests_total"
    total = f"sum(rate({metric}[{window}]))"
    success = f'sum(rate({metric}{{code!~"5.."}}[{window}]))'
    # rate() yields requests per second, so a floor of 1 would understate the
    # ratio whenever traffic is below 1 req/s. A tiny floor keeps the
    # division-by-zero guard (zero traffic still evaluates to 0) while leaving
    # every non-zero rate untouched.
    return f"({success}) / clamp_min({total}, 1e-9)"
def traefik_burn(window):
    """Return a PromQL expression for the error-budget burn rate over ``window``.

    The observed error ratio (1 - SLI) is divided by the allowed error ratio
    (1 - SLO_AVAILABILITY).
    """
    error_budget = 1 - SLO_AVAILABILITY
    observed_sli = traefik_sli(window)
    return f"(1 - ({observed_sli})) / {error_budget}"
2025-11-17 16:27:38 -03:00
# ---------------------------------------------------------------------------
# Panel factories
# ---------------------------------------------------------------------------
2025-11-17 14:22:46 -03:00
2025-11-17 16:27:38 -03:00
def stat_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    decimals=None,
    thresholds=None,
    text_mode="value",
    legend=None,
    instant=False,
    value_suffix=None,
    links=None,
    targets=None,
    field_overrides=None,
    description=None,
    orientation=None,
    wide_layout=None,
):
    """Return a Grafana stat panel definition.

    Args:
        panel_id: Panel id within the dashboard.
        title: Panel title.
        expr: PromQL expression; used only when ``targets`` is None.
        grid: ``gridPos`` dict for the panel.
        unit: Field unit.
        decimals: Optional number of decimals.
        thresholds: Optional thresholds dict; a grey/green default is used
            when this is falsy.
        text_mode: Value for ``options.textMode``.
        legend: Legend format, applied only when there is exactly one target.
        instant: When true, targets without an ``instant`` key get
            ``instant=True``.
        value_suffix: Optional ``custom.valueSuffix``.
        links: Optional panel links.
        targets: Optional explicit target dicts, used instead of ``expr``.
        field_overrides: Optional field override list.
        description: Optional panel description.
        orientation: Optional ``options.orientation``.
        wide_layout: Optional ``options.wideLayout``.

    Returns:
        dict: the panel definition.
    """
    defaults = {
        "color": {"mode": "thresholds"},
        "mappings": [],
        "thresholds": thresholds
        or {
            "mode": "absolute",
            "steps": [
                {"color": "rgba(115, 115, 115, 1)", "value": None},
                {"color": "green", "value": 1},
            ],
        },
        "unit": unit,
        "custom": {"displayMode": "auto"},
    }
    if value_suffix:
        defaults["custom"]["valueSuffix"] = value_suffix
    if decimals is not None:
        defaults["decimals"] = decimals

    # legendFormat/instant are written into the target dicts below, so copy
    # caller-supplied targets; otherwise a list shared between panels would
    # silently pick up this panel's settings.
    if targets is None:
        target_list = [{"expr": expr, "refId": "A"}]
    else:
        target_list = [dict(target) for target in targets]
    if legend and len(target_list) == 1:
        target_list[0]["legendFormat"] = legend
    if instant:
        for target in target_list:
            target.setdefault("instant", True)

    options = {
        "colorMode": "value",
        "graphMode": "area",
        "justifyMode": "center",
        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
        "textMode": text_mode,
    }
    if orientation:
        options["orientation"] = orientation
    if wide_layout is not None:
        options["wideLayout"] = wide_layout

    panel = {
        "id": panel_id,
        "type": "stat",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": target_list,
        "fieldConfig": {"defaults": defaults, "overrides": field_overrides or []},
        "options": options,
    }
    if links:
        panel["links"] = links
    if description:
        panel["description"] = description
    return panel
2025-11-18 12:11:47 -03:00
def gauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    min_value=0,
    max_value=1,
    thresholds=None,
    links=None,
):
    """Return a Grafana gauge panel definition for a single expression.

    When ``thresholds`` is falsy the gauge is green and turns red at
    ``max_value``.
    """
    fallback_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "red", "value": max_value},
        ],
    }
    field_defaults = {
        "min": min_value,
        "max": max_value,
        "thresholds": thresholds or fallback_thresholds,
    }
    panel = {
        "id": panel_id,
        "type": "gauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {"defaults": field_defaults, "overrides": []},
        "options": {
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "orientation": "auto",
            "showThresholdMarkers": False,
            "showThresholdLabels": False,
        },
    }
    if links:
        panel["links"] = links
    return panel
2025-11-17 16:27:38 -03:00
def timeseries_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    max_value=None,
    legend=None,
    legend_display="table",
    legend_placement="bottom",
    legend_calcs=None,
    time_from=None,
    links=None,
    targets=None,
    field_overrides=None,
    description=None,
    data_links=None,
):
    """Return a Grafana time-series panel definition.

    Args:
        panel_id: Panel id within the dashboard.
        title: Panel title.
        expr: PromQL expression; used only when ``targets`` is None.
        grid: ``gridPos`` dict for the panel.
        unit: Field unit.
        max_value: Optional fixed axis maximum.
        legend: Legend format, applied only when there is exactly one target.
        legend_display: Legend ``displayMode``.
        legend_placement: Legend ``placement``.
        legend_calcs: Optional legend calculations.
        time_from: Optional panel-level ``timeFrom`` override.
        links: Optional panel links.
        targets: Optional explicit target dicts, used instead of ``expr``.
        field_overrides: Optional field override list.
        description: Optional panel description.
        data_links: Optional data links stored on the field defaults.

    Returns:
        dict: the panel definition.
    """
    # legendFormat is written into the single target below, so copy
    # caller-supplied targets instead of mutating the caller's dicts.
    if targets is None:
        target_list = [{"expr": expr, "refId": "A"}]
    else:
        target_list = [dict(target) for target in targets]
    if legend and len(target_list) == 1:
        target_list[0]["legendFormat"] = legend

    field_defaults = {"unit": unit}
    if max_value is not None:
        field_defaults["max"] = max_value
    if data_links:
        field_defaults["links"] = data_links

    legend_options = {"displayMode": legend_display, "placement": legend_placement}
    if legend_calcs:
        legend_options["calcs"] = legend_calcs

    panel = {
        "id": panel_id,
        "type": "timeseries",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": target_list,
        "fieldConfig": {"defaults": field_defaults, "overrides": field_overrides or []},
        "options": {"legend": legend_options, "tooltip": {"mode": "multi"}},
    }
    if time_from:
        panel["timeFrom"] = time_from
    if links:
        panel["links"] = links
    if description:
        panel["description"] = description
    return panel
2026-05-15 22:07:41 -03:00
def state_timeline_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    description,
    thresholds,
    unit="percent",
    min_value=0,
    max_value=100,
    legend="{{suite}}",
    links=None,
    data_links=None,
):
    """Build a state-timeline panel: one threshold-coloured lane per series."""
    field_defaults = {
        "color": {"mode": "thresholds"},
        "unit": unit,
        "thresholds": thresholds,
        "custom": {"fillOpacity": 70, "lineWidth": 0, "spanNulls": True},
    }
    # Bounds are optional; passing None leaves the key out entirely.
    for bound_key, bound in (("min", min_value), ("max", max_value)):
        if bound is not None:
            field_defaults[bound_key] = bound
    if data_links:
        field_defaults["links"] = data_links

    panel = {
        "id": panel_id,
        "type": "state-timeline",
        "title": title,
        "description": description,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A", "legendFormat": legend}],
        "fieldConfig": {"defaults": field_defaults, "overrides": []},
        "options": {
            "mergeValues": True,
            "showValue": "never",
            "legend": {"displayMode": "list", "placement": "bottom"},
            "tooltip": {"mode": "single", "sort": "none"},
        },
    }
    if links:
        panel["links"] = links
    return panel
def apply_bar_timeseries_style(panel, *, stacked=False, fill_opacity=70):
    """Restyle a time-series panel in place so it draws bars, then return it.

    Any existing ``fieldConfig.defaults.custom`` settings are replaced.
    """
    bar_style = {
        "drawStyle": "bars",
        "barAlignment": 0,
        "barWidthFactor": 0.72,
        "lineWidth": 0,
        "fillOpacity": fill_opacity,
        "gradientMode": "none",
        "showPoints": "never",
        "spanNulls": True,
    }
    if stacked:
        bar_style["stacking"] = {"mode": "normal", "group": "A"}
    panel["fieldConfig"]["defaults"]["custom"] = bar_style
    return panel
2026-05-15 22:43:44 -03:00
def fixed_color_overrides(series_colors):
    """Build one fixed-colour field override per entry of ``series_colors``.

    ``series_colors`` maps an exact series name to the colour it should use.
    """
    overrides = []
    for series_name, color in series_colors.items():
        color_property = {"id": "color", "value": {"mode": "fixed", "fixedColor": color}}
        overrides.append(
            {
                "matcher": {"id": "byName", "options": series_name},
                "properties": [color_property],
            }
        )
    return overrides
2025-11-17 16:27:38 -03:00
def table_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    transformations=None,
    instant=False,
    options=None,
    filterable=True,
    footer=None,
    format=None,
    description=None,
    field_overrides=None,
    links=None,
):
    """Return a Grafana table panel definition.

    ``options`` is merged over the default table options, and ``footer``
    (when not None) is stored as ``options.footer`` after that merge.
    """
    panel_options = {"showHeader": True, "columnFilters": False, **(options or {})}
    if footer is not None:
        panel_options["footer"] = footer

    query = {"expr": expr, "refId": "A"}
    if instant:
        query["instant"] = True
    if format:
        query["format"] = format

    panel = {
        "id": panel_id,
        "type": "table",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {
            "defaults": {"unit": unit, "custom": {"filterable": filterable}},
            "overrides": field_overrides or [],
        },
        "options": panel_options,
    }
    # Optional top-level keys, added in a fixed order.
    optional_keys = (
        ("transformations", transformations),
        ("description", description),
        ("links", links),
    )
    for key, value in optional_keys:
        if value:
            panel[key] = value
    return panel
2026-04-19 14:18:41 -03:00
def pie_panel(panel_id, title, expr, grid, *, links=None, description=None):
    """Return a pie chart panel whose slices are labelled by namespace."""
    query = {"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}
    field_config = {
        "defaults": {
            "unit": "percent",
            "color": {"mode": "palette-classic"},
        },
        "overrides": [],
    }
    pie_options = {
        "legend": {"displayMode": "list", "placement": "right"},
        "pieType": "pie",
        "displayLabels": [],
        "tooltip": {"mode": "single"},
        "colorScheme": "interpolateSpectral",
        "colorBy": "value",
        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
    }
    panel = {
        "id": panel_id,
        "type": "piechart",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": field_config,
        "options": pie_options,
    }
    if links:
        panel["links"] = links
    if description:
        panel["description"] = description
    return panel
2025-11-17 14:22:46 -03:00
2026-01-01 14:44:33 -03:00
def namespace_scope_variable(var_name, label):
    """Return a hidden custom variable that selects a namespace scope.

    The workload scope is listed first and is the selected default.
    """
    choices = (
        ("workload namespaces only", NAMESPACE_SCOPE_WORKLOAD),
        ("all namespaces", NAMESPACE_SCOPE_ALL),
        ("infrastructure namespaces only", NAMESPACE_SCOPE_INFRA),
    )
    options = [
        {"text": text, "value": value, "selected": position == 0}
        for position, (text, value) in enumerate(choices)
    ]
    # Custom-variable query syntax: comma-separated "text : value" pairs.
    query = ",".join(f"{text} : {value}" for text, value in choices)
    default_text, default_value = choices[0]
    return {
        "name": var_name,
        "label": label,
        "type": "custom",
        "query": query,
        "current": {"text": default_text, "value": default_value, "selected": True},
        "options": options,
        "hide": 2,
        "multi": False,
        "includeAll": False,
        "refresh": 1,
        "sort": 0,
        "skipUrlSync": False,
    }
2026-04-19 14:18:41 -03:00
def namespace_scope_links(var_name):
    """Return links that switch ``var_name`` to each namespace scope.

    Every other variable in NAMESPACE_SCOPE_VARS is forwarded unchanged via a
    ``${name}`` reference so only the targeted scope changes.
    """

    def scope_url(value):
        encoded = urllib.parse.quote(value, safe="")
        params = [
            f"var-{other}={encoded}" if other == var_name else f"var-{other}=${{{other}}}"
            for other in NAMESPACE_SCOPE_VARS
        ]
        return "?" + "&".join(params)

    scopes = (
        ("Workload namespaces only", NAMESPACE_SCOPE_WORKLOAD),
        ("All namespaces", NAMESPACE_SCOPE_ALL),
        ("Infrastructure namespaces only", NAMESPACE_SCOPE_INFRA),
    )
    return [
        {"title": link_title, "url": scope_url(scope), "targetBlank": False}
        for link_title, scope in scopes
    ]
2026-04-18 17:47:06 -03:00
def testing_suite_variable():
    """Return the custom "suite" variable listing every platform test suite.

    "All" is the current selection and expands to
    PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER.
    """
    options = []
    pairs = []
    for suite in PLATFORM_TEST_SUITE_NAMES:
        options.append({"text": suite, "value": suite, "selected": False})
        pairs.append(f"{suite} : {suite}")
    return {
        "name": "suite",
        "label": "Suite",
        "type": "custom",
        "query": ",".join(pairs),
        "current": {"text": "All", "value": "$__all", "selected": True},
        "options": options,
        "hide": 0,
        "multi": False,
        "includeAll": True,
        "allValue": PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER,
        "refresh": 1,
        "sort": 1,
        "skipUrlSync": False,
    }
2026-04-20 08:35:05 -03:00
def testing_case_variable():
    """Return the query variable "test" listing test-case names.

    Values come from the ``test`` label of
    platform_quality_gate_test_case_result, filtered by the selected suite and
    branch, and skipping empty names and the "__no_test_cases__" placeholder.
    """
    matchers = ",".join(
        (
            'suite=~"${suite:regex}"',
            'branch!=""',
            'branch=~"${branch:regex}"',
            'test!=""',
            'test!="__no_test_cases__"',
            PLATFORM_TEST_EXPORT_FILTER,
        )
    )
    variable = {
        "name": "test",
        "label": "Test Case",
        "type": "query",
        "query": "label_values(platform_quality_gate_test_case_result{" + matchers + "}, test)",
        "current": {"text": "All", "value": "$__all", "selected": True},
        "options": [],
        "hide": 0,
        "multi": False,
        "includeAll": True,
        "allValue": ".*",
        "refresh": 2,
        "sort": 1,
        "skipUrlSync": False,
    }
    return variable
2026-04-21 09:35:43 -03:00
def testing_branch_variable():
    """Return the query variable "branch" listing branches with build info.

    Values come from the ``branch`` label of platform_quality_gate_build_info
    for the selected suite, skipping empty branch names.
    """
    matchers = ",".join(
        (
            'suite=~"${suite:regex}"',
            'branch!=""',
            PLATFORM_TEST_EXPORT_FILTER,
        )
    )
    variable = {
        "name": "branch",
        "label": "Branch",
        "type": "query",
        "query": "label_values(platform_quality_gate_build_info{" + matchers + "}, branch)",
        "current": {"text": "All", "value": "$__all", "selected": True},
        "options": [],
        "hide": 0,
        "multi": False,
        "includeAll": True,
        "allValue": ".*",
        "refresh": 2,
        "sort": 1,
        "skipUrlSync": False,
    }
    return variable
2026-04-20 13:45:01 -03:00
def jenkins_base_variable():
    """Return the textbox variable holding the Jenkins base URL.

    The query and the current value both start at JENKINS_UI_BASE_DEFAULT.
    """
    base_url = JENKINS_UI_BASE_DEFAULT
    current = {"text": base_url, "value": base_url, "selected": True}
    return {
        "name": "jenkins_base",
        "label": "Jenkins Base URL",
        "type": "textbox",
        "query": base_url,
        "current": current,
        "hide": 0,
        "skipUrlSync": False,
    }
def jenkins_suite_links(base_var="${jenkins_base}"):
    """Return panel links to Jenkins: the root page plus two links per suite.

    Each suite gets a link to its job and one to the artifacts of the job's
    last completed build. The job name comes from
    PLATFORM_TEST_JENKINS_JOB_BY_SUITE, falling back to the suite name, and is
    percent-encoded.
    """

    def external(link_title, url):
        return {"title": link_title, "url": url, "targetBlank": True}

    links = [external("Open Jenkins", f"{base_var}/")]
    for suite in PLATFORM_TEST_SUITE_NAMES:
        job_name = PLATFORM_TEST_JENKINS_JOB_BY_SUITE.get(suite, suite)
        job_url = f"{base_var}/job/{urllib.parse.quote(job_name, safe='')}/"
        links.extend(
            (
                external(f"{suite}: Job", job_url),
                external(f"{suite}: Last Artifacts", f"{job_url}lastCompletedBuild/artifact/"),
            )
        )
    return links
2026-04-21 11:39:13 -03:00
def jenkins_artifact_data_links(base_var="${jenkins_base}"):
    """Return data links to a specific Jenkins build and its artifacts.

    The job and build number are taken from the ``jenkins_job`` and
    ``build_number`` labels of the clicked field.
    """
    build_url = (
        base_var
        + "/job/${__field.labels.jenkins_job}"
        + "/${__field.labels.build_number}/"
    )
    artifacts_link = {
        "title": "Open build artifacts",
        "url": build_url + "artifact/",
        "targetBlank": True,
    }
    build_link = {"title": "Open build", "url": build_url, "targetBlank": True}
    return [artifacts_link, build_link]
def jenkins_latest_artifact_data_links(base_var="${jenkins_base}"):
    """Return data links to a Jenkins job and its latest build artifacts.

    The job is taken from the ``jenkins_job`` label of the clicked field.
    """
    job_url = base_var + "/job/${__field.labels.jenkins_job}/"
    latest_artifacts_link = {
        "title": "Open latest artifacts",
        "url": job_url + "lastCompletedBuild/artifact/",
        "targetBlank": True,
    }
    job_link = {"title": "Open Jenkins job", "url": job_url, "targetBlank": True}
    return [latest_artifacts_link, job_link]
2025-12-12 20:20:13 -03:00
def bargauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    legend=None,
    links=None,
    limit=None,
    sort_order="desc",
    thresholds=None,
    decimals=None,
    instant=False,
    overrides=None,
    data_links=None,
    include_color=True,
    description=None,
):
    """Return a horizontal bar gauge panel with one bar per series.

    Unless ``expr`` already starts with ``sort(`` or ``sort_desc(``, it is
    wrapped in the PromQL sort function matching ``sort_order`` ("desc" or
    "asc"; any other value leaves it unwrapped). The same order is passed to a
    ``sortBy`` transformation, optionally followed by a ``limit``.

    NOTE(review): the sortBy options use the keys "fields"/"order"; confirm
    they match the transformer schema of the Grafana version in use.
    """
    sort_functions = {"desc": "sort_desc", "asc": "sort"}
    already_sorted = expr.strip().startswith(("sort(", "sort_desc("))
    sort_function = sort_functions.get(sort_order)
    if sort_function and not already_sorted:
        expr = f"{sort_function}({expr})"

    default_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 50},
            {"color": "orange", "value": 70},
            {"color": "red", "value": 85},
        ],
    }
    field_defaults = {"color": {"mode": "thresholds"}} if include_color else {}
    field_defaults["unit"] = unit
    field_defaults["min"] = 0
    # Only percent panels get a fixed upper bound; others store None.
    field_defaults["max"] = 100 if unit == "percent" else None
    field_defaults["thresholds"] = thresholds or default_thresholds
    if decimals is not None:
        field_defaults["decimals"] = decimals
    if data_links:
        field_defaults["links"] = data_links

    query = {"expr": expr, "refId": "A", "legendFormat": legend or "{{node}}"}
    if instant:
        query["instant"] = True

    panel = {
        "id": panel_id,
        "type": "bargauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {
            "defaults": field_defaults,
            "overrides": list(overrides) if overrides else [],
        },
        "options": {
            "displayMode": "basic",
            "orientation": "horizontal",
            "reduceOptions": {
                "calcs": ["lastNotNull"],
                "fields": "",
                "values": False,
            },
        },
    }
    if links:
        panel["links"] = links
    if description:
        panel["description"] = description

    # Mirror the requested sort order in a panel transformation, then
    # optionally keep only the first ``limit`` rows.
    transformations = [{"id": "sortBy", "options": {"fields": ["Value"], "order": sort_order}}]
    if limit:
        transformations.append({"id": "limit", "options": {"limit": limit}})
    panel["transformations"] = transformations
    return panel
2026-05-11 01:01:46 -03:00
def set_bargauge_display_mode(panels, display_mode):
    """Set the display mode option on every bar gauge reachable from *panels*.

    Panels are modified in place and nothing is returned. Any panel that
    carries a non-empty list of child panels (as collapsed rows do) is
    descended into, so nested bar gauges are updated too.
    """
    # An explicit stack of iterators gives the same depth-first, pre-order
    # walk as a recursive traversal.
    pending = [iter(panels)]
    while pending:
        for entry in pending[-1]:
            if entry.get(" type ") == " bargauge ":
                entry[" options "][" displayMode "] = display_mode
            children = entry.get(" panels ")
            if children:
                # Walk the children before resuming the current level.
                pending.append(iter(children))
                break
        else:
            # The current level is exhausted; return to its parent.
            pending.pop()
2026-04-19 14:18:41 -03:00
def text_panel(panel_id, title, content, grid):
    """Return a Grafana text panel that shows *content* in markdown mode.

    *grid* is stored as the panel's grid position without being copied, and
    the datasource entry is explicitly set to None.
    """
    markdown_options = {" mode ": " markdown ", " content ": content}
    panel = {
        " id ": panel_id,
        " type ": " text ",
        " title ": title,
        " gridPos ": grid,
    }
    panel[" datasource "] = None
    panel[" options "] = markdown_options
    return panel
2026-04-13 00:25:33 -03:00
2026-04-22 16:56:52 -03:00
def row_panel(panel_id, title, y, *, collapsed=True, panels=None):
    """Build a Grafana row panel one unit tall and 24 wide at grid row *y*.

    A collapsed row carries its child panels (an empty list when none are
    supplied); an expanded row omits the child-panel key altogether.
    """
    row = {
        " id ": panel_id,
        " type ": " row ",
        " title ": title,
        " gridPos ": {" h ": 1, " w ": 24, " x ": 0, " y ": y},
        " collapsed ": collapsed,
    }
    if collapsed:
        row[" panels "] = panels if panels else []
    return row
2026-04-19 14:18:41 -03:00
# Link titles for known dashboard UIDs. link_to() looks titles up here and
# falls back to a generic label for any UID that is not listed.
# NOTE(review): the jobs and testing UIDs share one title, and that title lacks
# the "Open" prefix every other entry carries -- confirm this is intentional.
DASHBOARD_LINK_TITLES = {
    " atlas-overview ": " Open Atlas Overview ",
    " atlas-pods ": " Open Atlas Pods ",
    " atlas-nodes ": " Open Atlas Nodes ",
    " atlas-storage ": " Open Atlas Storage ",
    " atlas-network ": " Open Atlas Network ",
    " atlas-mail ": " Open Atlas Mail ",
    " atlas-jobs ": " Atlas Testing ",
    " atlas-testing ": " Atlas Testing ",
    " atlas-power ": " Open Atlas Power ",
    " atlas-gitops ": " Open Atlas GitOps ",
    " atlas-gpu ": " Open Atlas GPU ",
}
2026-04-13 23:13:45 -03:00
2026-04-19 14:18:41 -03:00
def link_to(uid):
    """Return a single-item link list pointing at dashboard *uid*.

    The title is looked up in DASHBOARD_LINK_TITLES; UIDs without an entry
    get a generic label built from the UID itself. The link has targetBlank
    enabled.
    """
    fallback_title = f" Open { uid } dashboard "
    link = {
        " title ": DASHBOARD_LINK_TITLES.get(uid, fallback_title),
        " url ": f" /d/ { uid } ",
        " targetBlank ": True,
    }
    return [link]
2026-04-13 23:13:45 -03:00
2026-05-12 04:19:36 -03:00
def overview_link_to(uid):
    """Return the Overview dashboard's historical link list for *uid*.

    The title is always the generic label built from the UID; no title table
    is consulted. The link has targetBlank enabled.
    """
    title = f" Open { uid } dashboard "
    url = f" /d/ { uid } "
    link = {" title ": title, " url ": url, " targetBlank ": True}
    return [link]
2026-04-19 14:18:41 -03:00
# ---------------------------------------------------------------------------
# Dashboard builders
# ---------------------------------------------------------------------------
def build_overview ( ) :
panels = [ ]
2026-05-12 04:19:36 -03:00
overview_link = overview_link_to
climate_drop_labels = " job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group "
climate_temp_series = f " max without ( { climate_drop_labels } ) (typhon_temperature_celsius != 0) "
climate_humidity_series = f " max without ( { climate_drop_labels } ) (typhon_relative_humidity_percent != 0) "
climate_pressure_series = f " max without ( { climate_drop_labels } ) (typhon_vpd_kpa != 0) "
2026-05-16 05:08:09 -03:00
overview_pvc_backup_metric_presence = (
' count( { __name__=~ " pvc_backup_(count|last_success_timestamp_seconds|health_reason) " ,driver= " restic " }) '
)
overview_pvc_backup_missing = (
' label_replace(label_replace(vector(999), " namespace " , " maintenance " , " __name__ " , " .* " ), '
' " pvc " , " backup-telemetry-missing " , " __name__ " , " .* " ) '
)
2026-05-12 04:19:36 -03:00
overview_pvc_backup_age = (
' max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds { driver= " restic " }) / 3600) '
' or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason { driver= " restic " ,reason=~ " missing|no_completed|lookup_failed|unknown_timestamp " } > 0) '
2026-05-16 05:08:09 -03:00
f ' * (pvc_backup_count {{ driver= " restic " }} > bool 0)) * 999))) or on() '
f ' (( { overview_pvc_backup_missing } ) unless on() (( { overview_pvc_backup_metric_presence } ) > 0)) '
2026-05-12 04:19:36 -03:00
)
def overview_metric_pair_expr(first_expr, first_name, second_expr, second_name):
    """Combine two PromQL expressions into one query, each tagged by name.

    Each expression is wrapped in label_replace so its result carries a
    metric label holding the given name, and the two are joined with ``or``.
    """
    first_clause = f' label_replace( { first_expr } , " metric " , " { first_name } " , " __name__ " , " .* " ) '
    second_clause = f' or label_replace( { second_expr } , " metric " , " { second_name } " , " __name__ " , " .* " ) '
    return "".join((first_clause, second_clause))
def overview_platform_test_success_targets ( ) :
""" Return one query target per platform test suite with its 1h pass percentage. """
# Each entry is (legend, suite regex). The regex is matched against the
# suite label; several accept more than one spelling of the same suite.
suites = [
( " ariadne " , " ariadne " ) ,
( " metis " , " metis " ) ,
( " ananke " , " ananke " ) ,
( " atlasbot " , " atlasbot " ) ,
( " lesavka " , " lesavka " ) ,
( " pegasus " , " pegasus|pegasus-health|pegasus_health " ) ,
( " soteria " , " soteria " ) ,
( " titan-iac " , " titan-iac|titan_iac " ) ,
( " bstein-home " , " bstein-home|bstein_home " ) ,
( " arcanagon " , " arcanagon " ) ,
( " data-prepper " , " data-prepper|data_prepper " ) ,
]
targets = [ ]
for index , ( legend , suite_regex ) in enumerate ( suites ) :
# All quality-gate runs for the suite over the last hour.
total = f ' sum(increase(platform_quality_gate_runs_total {{ suite=~ " { suite_regex } " }} [1h])) '
# The same runs, restricted to statuses matching PLATFORM_TEST_SUCCESS_STATUS.
passed = (
f ' sum(increase(platform_quality_gate_runs_total {{ suite=~ " { suite_regex } " , '
f ' status=~ " { PLATFORM_TEST_SUCCESS_STATUS } " }} [1h])) '
)
targets . append (
{
# refIds are assigned sequentially from the suite's position in the list.
" refId " : chr ( ord ( " A " ) + index ) ,
# Pass percentage with the denominator clamped to at least 1; the value is
# kept only when the suite had runs, otherwise the query falls back to 0.
" expr " : f " (100 * ( { passed } ) / clamp_min(( { total } ), 1)) and on() (( { total } ) > 0) or on() vector(0) " ,
" legendFormat " : legend ,
}
)
return targets
2026-04-19 14:18:41 -03:00
2026-01-21 13:37:36 -03:00
age_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 6 } ,
{ " color " : " orange " , " value " : 24 } ,
{ " color " : " red " , " value " : 48 } ,
] ,
}
2025-11-18 15:55:24 -03:00
2025-12-12 15:23:51 -03:00
row1_stats = [
{
" id " : 2 ,
" title " : " Control Plane Ready " ,
" expr " : f ' sum(kube_node_status_condition {{ condition= " Ready " ,status= " true " ,node=~ " { CONTROL_REGEX } " }} ) ' ,
" kind " : " gauge " ,
" max_value " : CONTROL_TOTAL ,
" thresholds " : {
2025-11-17 19:24:03 -03:00
" mode " : " absolute " ,
" steps " : [
2025-11-18 11:12:03 -03:00
{ " color " : " red " , " value " : None } ,
2025-11-17 19:24:03 -03:00
{ " color " : " green " , " value " : CONTROL_TOTAL } ,
] ,
2025-12-12 15:23:51 -03:00
} ,
} ,
{
" id " : 3 ,
" title " : " Control Plane Workloads " ,
" expr " : CONTROL_WORKLOADS_EXPR ,
" kind " : " stat " ,
2025-12-12 20:30:00 -03:00
" thresholds " : {
" mode " : " absolute " ,
" steps " : [
2025-12-12 20:50:41 -03:00
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " red " , " value " : 3 } ,
2025-12-12 20:30:00 -03:00
] ,
} ,
2026-05-12 04:19:36 -03:00
" links " : overview_link ( " atlas-pods " ) ,
2025-12-12 15:23:51 -03:00
} ,
2025-12-12 15:56:33 -03:00
{
" id " : 5 ,
" title " : " Stuck Terminating " ,
" expr " : STUCK_TERMINATING_EXPR ,
" kind " : " stat " ,
2025-12-12 20:30:00 -03:00
" thresholds " : {
" mode " : " absolute " ,
" steps " : [
2025-12-12 20:50:41 -03:00
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " red " , " value " : 3 } ,
2025-12-12 20:30:00 -03:00
] ,
} ,
2026-05-12 04:19:36 -03:00
" links " : overview_link ( " atlas-pods " ) ,
2025-12-12 15:56:33 -03:00
} ,
2025-12-12 15:23:51 -03:00
{
" id " : 27 ,
2026-05-10 15:40:12 -03:00
" title " : " Atlas Availability (365d) " ,
2025-12-12 16:11:28 -03:00
" expr " : UPTIME_PERCENT_EXPR ,
2025-12-12 15:23:51 -03:00
" kind " : " stat " ,
2025-12-12 16:11:28 -03:00
" thresholds " : UPTIME_PERCENT_THRESHOLDS ,
2025-12-12 16:15:37 -03:00
" unit " : " percentunit " ,
2025-12-19 15:18:14 -03:00
" decimals " : 4 ,
2025-12-12 15:23:51 -03:00
" text_mode " : " value " ,
2026-05-10 14:40:55 -03:00
" instant " : True ,
2026-05-15 22:07:41 -03:00
" description " : " Rolling 365-day availability from vmalert ' s precomputed atlas:availability:ratio_365d series. Grafana keeps the last successful rollup for up to 24h so one missed long-window evaluation does not render as No data. " ,
2025-12-12 15:23:51 -03:00
} ,
{
" id " : 4 ,
" title " : " Problem Pods " ,
" expr " : PROBLEM_PODS_EXPR ,
" kind " : " stat " ,
2025-12-12 20:30:00 -03:00
" thresholds " : {
" mode " : " absolute " ,
" steps " : [
2025-12-12 20:50:41 -03:00
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " red " , " value " : 3 } ,
2025-12-12 20:30:00 -03:00
] ,
} ,
2026-05-12 04:19:36 -03:00
" links " : overview_link ( " atlas-pods " ) ,
2025-12-12 15:23:51 -03:00
} ,
{
" id " : 6 ,
" title " : " CrashLoop / ImagePull " ,
" expr " : CRASHLOOP_EXPR ,
" kind " : " stat " ,
2025-12-12 20:30:00 -03:00
" thresholds " : {
" mode " : " absolute " ,
" steps " : [
2025-12-12 20:50:41 -03:00
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 1 } ,
{ " color " : " orange " , " value " : 2 } ,
{ " color " : " red " , " value " : 3 } ,
2025-12-12 20:30:00 -03:00
] ,
} ,
2026-05-12 04:19:36 -03:00
" links " : overview_link ( " atlas-pods " ) ,
2025-12-12 15:23:51 -03:00
} ,
{
2025-12-12 15:56:33 -03:00
" id " : 1 ,
" title " : " Workers Ready " ,
" expr " : f ' sum(kube_node_status_condition {{ condition= " Ready " ,status= " true " ,node=~ " { WORKER_REGEX } " }} ) ' ,
" kind " : " gauge " ,
" max_value " : WORKER_TOTAL ,
" thresholds " : {
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
{ " color " : " orange " , " value " : WORKER_TOTAL - 2 } ,
{ " color " : " yellow " , " value " : WORKER_TOTAL - 1 } ,
{ " color " : " green " , " value " : WORKER_TOTAL } ,
] ,
} ,
2025-12-12 15:23:51 -03:00
} ,
]
def gauge_grid(idx):
    """Return ``(width, x)`` for the panel at position *idx* in the row.

    Widths come from GAUGE_WIDTHS; positions past the end of that list use a
    fallback width of 4. ``x`` is the combined width of every preceding
    position, including preceding positions that used the fallback width.
    """
    fallback_width = 4
    known = len(GAUGE_WIDTHS)
    width = GAUGE_WIDTHS[idx] if idx < known else fallback_width
    # Positions beyond GAUGE_WIDTHS each occupy the fallback width. Counting
    # them in the offset keeps overflow panels from all landing on the same x.
    # For idx <= len(GAUGE_WIDTHS) this term is zero, so results are unchanged.
    overflow = max(idx - known, 0)
    x = sum(GAUGE_WIDTHS[:idx]) + fallback_width * overflow
    return width, x
for idx , item in enumerate ( row1_stats ) :
panel_id = item [ " id " ]
2025-11-18 15:55:24 -03:00
width , x = gauge_grid ( idx )
2025-12-12 15:23:51 -03:00
grid = { " h " : 5 , " w " : width , " x " : x , " y " : 0 }
kind = item . get ( " kind " , " gauge " )
if kind == " stat " :
2025-11-18 17:09:13 -03:00
panels . append (
stat_panel (
panel_id ,
2025-12-12 15:23:51 -03:00
item [ " title " ] ,
item [ " expr " ] ,
grid ,
thresholds = item . get ( " thresholds " ) ,
2026-05-10 14:40:55 -03:00
legend = None ,
links = item . get ( " links " ) ,
text_mode = item . get ( " text_mode " , " value " ) ,
value_suffix = item . get ( " value_suffix " ) ,
unit = item . get ( " unit " , " none " ) ,
decimals = item . get ( " decimals " ) ,
instant = item . get ( " instant " , False ) ,
description = item . get ( " description " ) ,
)
)
2025-11-18 17:09:13 -03:00
else :
panels . append (
gauge_panel (
panel_id ,
2025-12-12 15:23:51 -03:00
item [ " title " ] ,
item [ " expr " ] ,
grid ,
min_value = 0 ,
max_value = item . get ( " max_value " , 5 ) ,
thresholds = item . get ( " thresholds " ) ,
links = item . get ( " links " ) ,
2025-11-18 17:09:13 -03:00
)
2025-11-17 14:22:46 -03:00
)
2025-11-17 16:27:38 -03:00
2026-04-09 14:56:43 -03:00
top_health_panels = [
2025-11-17 21:20:19 -03:00
( 7 , " Hottest node: CPU " , topk_with_node ( node_cpu_expr ( ) ) , " percent " ) ,
( 8 , " Hottest node: RAM " , topk_with_node ( node_mem_expr ( ) ) , " percent " ) ,
2025-11-17 20:19:20 -03:00
( 9 , " Hottest node: NET (rx+tx) " , topk_with_node ( node_net_expr ( ) ) , " Bps " ) ,
( 10 , " Hottest node: I/O (r+w) " , topk_with_node ( node_io_expr ( ) ) , " Bps " ) ,
2026-04-09 14:56:43 -03:00
( 23 , " Astreae Usage " , astreae_usage_expr ( " /mnt/astreae " ) , " percent " ) ,
( 24 , " Asteria Usage " , astreae_usage_expr ( " /mnt/asteria " ) , " percent " ) ,
( 25 , " Astreae Free " , astreae_free_expr ( " /mnt/astreae " ) , " decbytes " ) ,
( 26 , " Asteria Free " , astreae_free_expr ( " /mnt/asteria " ) , " decbytes " ) ,
2025-11-17 16:27:38 -03:00
]
2026-04-09 14:56:43 -03:00
for idx , ( panel_id , title , expr , unit ) in enumerate ( top_health_panels ) :
is_hottest_panel = panel_id in { 7 , 8 , 9 , 10 }
2025-11-17 16:27:38 -03:00
panels . append (
stat_panel (
panel_id ,
title ,
2025-11-17 20:19:20 -03:00
f " { expr } " ,
2026-04-09 14:56:43 -03:00
{ " h " : 2 , " w " : 3 , " x " : 3 * idx , " y " : 5 } ,
2025-11-17 16:27:38 -03:00
unit = unit ,
thresholds = PERCENT_THRESHOLDS if unit == " percent " else None ,
2026-04-09 14:56:43 -03:00
text_mode = " name_and_value " if is_hottest_panel else " value " ,
legend = " {{ node}} " if is_hottest_panel else None ,
instant = is_hottest_panel ,
2026-05-12 04:19:36 -03:00
links = overview_link ( " atlas-storage " if panel_id in { 23 , 24 , 25 , 26 } else " atlas-nodes " ) ,
2025-11-17 16:27:38 -03:00
)
)
2026-01-05 21:55:59 -03:00
mail_bounce_rate_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 5 } ,
{ " color " : " orange " , " value " : 8 } ,
{ " color " : " red " , " value " : 10 } ,
] ,
}
2026-01-06 02:06:20 -03:00
mail_limit_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 70 } ,
{ " color " : " orange " , " value " : 85 } ,
{ " color " : " red " , " value " : 95 } ,
] ,
}
2026-01-06 02:34:52 -03:00
mail_success_thresholds = {
2026-01-05 21:55:59 -03:00
" mode " : " absolute " ,
" steps " : [
{ " color " : " red " , " value " : None } ,
2026-01-06 02:34:52 -03:00
{ " color " : " orange " , " value " : 90 } ,
{ " color " : " yellow " , " value " : 95 } ,
{ " color " : " green " , " value " : 98 } ,
2026-01-05 21:55:59 -03:00
] ,
}
2026-05-15 22:43:44 -03:00
dark_red = " dark-red "
dark_orange = " dark-orange "
dark_yellow = " dark-yellow "
dark_green = " dark-green "
dark_blue = " dark-blue "
2026-05-15 22:07:41 -03:00
test_success_thresholds = {
" mode " : " absolute " ,
" steps " : [
2026-05-15 22:43:44 -03:00
{ " color " : dark_red , " value " : None } ,
{ " color " : dark_orange , " value " : 70 } ,
{ " color " : dark_yellow , " value " : 85 } ,
{ " color " : dark_green , " value " : 95 } ,
{ " color " : dark_blue , " value " : 100 } ,
2026-05-15 22:07:41 -03:00
] ,
}
2026-05-16 05:08:09 -03:00
fan_intensity_thresholds = {
" mode " : " absolute " ,
" steps " : [
2026-05-16 05:34:24 -03:00
{ " color " : " #1f60c4 " , " value " : None } ,
{ " color " : " #2870b8 " , " value " : 1 } ,
{ " color " : " #2f8599 " , " value " : 2 } ,
{ " color " : " #2f9e44 " , " value " : 3 } ,
{ " color " : " #76a935 " , " value " : 4 } ,
{ " color " : " #d4b106 " , " value " : 5 } ,
{ " color " : " #d69605 " , " value " : 6 } ,
{ " color " : " #e06c00 " , " value " : 7 } ,
{ " color " : " #d95718 " , " value " : 8 } ,
{ " color " : " #c92a2a " , " value " : 9 } ,
{ " color " : " #8f1d1d " , " value " : 10 } ,
2026-05-16 05:08:09 -03:00
] ,
}
fan_intensity_expr = (
f ' label_replace(max without ( { climate_drop_labels } ) (typhon_fan_speed_level {{ port= " 1 " }} ), " fan " , " Outlet " , " __name__ " , " .* " ) '
f ' or label_replace(max without ( { climate_drop_labels } ) (typhon_fan_speed_level {{ port= " 2 " }} ), " fan " , " Inlet - Inside " , " __name__ " , " .* " ) '
f ' or label_replace(max without ( { climate_drop_labels } ) (typhon_fan_speed_level {{ port= " 3 " }} ), " fan " , " Inlet - Outside " , " __name__ " , " .* " ) '
f ' or label_replace(max without ( { climate_drop_labels } ) (typhon_fan_speed_level {{ port= " 4 " }} ), " fan " , " Interior " , " __name__ " , " .* " ) '
)
gitops_health_history_expr = (
f ' label_replace( { GITOPS_KUSTOMIZATION_READY_PCT } , " signal " , " Kustomizations Ready " , " __name__ " , " .* " ) '
f ' or label_replace( { GITOPS_HELM_READY_PCT } , " signal " , " HelmReleases Ready " , " __name__ " , " .* " ) '
f ' or label_replace( { GITOPS_KUSTOMIZATION_NOT_SUSPENDED_PCT } , " signal " , " Kustomizations Not Suspended " , " __name__ " , " .* " ) '
f ' or label_replace( { GITOPS_HELM_NOT_SUSPENDED_PCT } , " signal " , " HelmReleases Not Suspended " , " __name__ " , " .* " ) '
)
2026-05-16 03:31:04 -03:00
compact_current_text = { " titleSize " : 11 , " valueSize " : 20 }
perfect_count_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : dark_red , " value " : None } ,
{ " color " : dark_yellow , " value " : max ( len ( PLATFORM_TEST_SUITE_NAMES ) - 2 , 1 ) } ,
{ " color " : dark_green , " value " : len ( PLATFORM_TEST_SUITE_NAMES ) - 1 } ,
{ " color " : dark_blue , " value " : len ( PLATFORM_TEST_SUITE_NAMES ) } ,
] ,
}
failure_count_thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : dark_blue , " value " : None } ,
{ " color " : dark_yellow , " value " : 1 } ,
{ " color " : dark_orange , " value " : 3 } ,
{ " color " : dark_red , " value " : 5 } ,
] ,
}
overview_avg_coverage = f " (avg(( { QUALITY_GATE_COVERAGE_BY_SUITE } )) or on() vector(0)) "
overview_loc_clean_suites = f " (sum(( { QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE } ) == bool 0) or on() vector(0)) "
2026-05-12 04:19:36 -03:00
for panel_id , title , draw_expr , runtime_expr , y_pos in [
( 40 , " Pyrphoros UPS Current " , ANANKE_UPS_DRAW_WATTS_DB , ANANKE_UPS_RUNTIME_DB , 7 ) ,
( 144 , " Statera UPS Current " , ANANKE_UPS_DRAW_WATTS_TETHYS , ANANKE_UPS_RUNTIME_TETHYS , 10 ) ,
] :
panel = stat_panel (
panel_id ,
title ,
2026-04-19 14:18:41 -03:00
None ,
2026-05-16 03:31:04 -03:00
{ " h " : 3 , " w " : 3 , " x " : 0 , " y " : y_pos } ,
2026-04-19 14:18:41 -03:00
unit = " none " ,
2026-04-13 06:22:41 -03:00
text_mode = " name_and_value " ,
2026-04-19 14:18:41 -03:00
targets = [
2026-05-12 04:19:36 -03:00
{
" expr " : overview_metric_pair_expr ( draw_expr , " Draw " , runtime_expr , " Runtime " ) ,
" refId " : " A " ,
" legendFormat " : " {{ metric}} " ,
" instant " : True ,
}
2026-04-03 22:16:02 -03:00
] ,
2026-04-13 03:35:39 -03:00
field_overrides = [
2026-05-12 04:19:36 -03:00
{ " matcher " : { " id " : " byName " , " options " : " Draw " } , " properties " : [ { " id " : " unit " , " value " : " watt " } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " Runtime " } , " properties " : [ { " id " : " unit " , " value " : " s " } ] } ,
2026-04-13 01:08:58 -03:00
] ,
2026-05-12 04:19:36 -03:00
links = overview_link ( " atlas-power " ) ,
2026-04-03 14:55:16 -03:00
)
2026-05-16 03:31:04 -03:00
panel [ " options " ] [ " text " ] = compact_current_text
2026-05-12 04:19:36 -03:00
panels . append ( panel )
2026-05-16 06:11:22 -03:00
ups_history = timeseries_panel (
41 ,
" UPS History (Power Draw) " ,
None ,
{ " h " : 6 , " w " : 6 , " x " : 3 , " y " : 7 } ,
unit = " watt " ,
targets = [
{ " refId " : " A " , " expr " : ANANKE_UPS_DRAW_WATTS_DB_SERIES , " legendFormat " : ANANKE_UPS_DB_NAME } ,
{ " refId " : " B " , " expr " : ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES , " legendFormat " : ANANKE_UPS_TETHYS_NAME } ,
] ,
field_overrides = fixed_color_overrides (
{ ANANKE_UPS_DB_NAME : dark_blue , ANANKE_UPS_TETHYS_NAME : dark_yellow }
) ,
legend_display = " list " ,
legend_placement = " bottom " ,
links = overview_link ( " atlas-power " ) ,
2026-04-03 14:55:16 -03:00
)
2026-05-16 06:11:22 -03:00
ups_history [ " fieldConfig " ] [ " defaults " ] [ " custom " ] = {
" drawStyle " : " line " ,
" lineInterpolation " : " linear " ,
" lineWidth " : 2 ,
" fillOpacity " : 18 ,
" showPoints " : " never " ,
" spanNulls " : True ,
}
panels . append ( ups_history )
2026-05-12 04:19:36 -03:00
temp_panel = stat_panel (
42 ,
" Current Enclosure Temperature " ,
None ,
2026-05-16 03:31:04 -03:00
{ " h " : 3 , " w " : 3 , " x " : 0 , " y " : 13 } ,
2026-05-12 04:19:36 -03:00
unit = " none " ,
text_mode = " name_and_value " ,
targets = [
{
" expr " : overview_metric_pair_expr (
f " max( { climate_temp_series } ) or on() vector(0) " ,
" °C " ,
f " max(( { climate_temp_series } ) * 9 / 5 + 32) or on() vector(0) " ,
" °F " ,
) ,
" refId " : " A " ,
" legendFormat " : " {{ metric}} " ,
" instant " : True ,
}
] ,
field_overrides = [
{ " matcher " : { " id " : " byName " , " options " : " °C " } , " properties " : [ { " id " : " unit " , " value " : " celsius " } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " °F " } , " properties " : [ { " id " : " unit " , " value " : " fahrenheit " } ] } ,
] ,
links = overview_link ( " atlas-power " ) ,
)
2026-05-16 03:31:04 -03:00
temp_panel [ " options " ] [ " text " ] = compact_current_text
2026-05-12 04:19:36 -03:00
panels . append ( temp_panel )
climate_panel = stat_panel (
143 ,
" Current Enclosure Climate " ,
None ,
2026-05-16 03:31:04 -03:00
{ " h " : 3 , " w " : 3 , " x " : 0 , " y " : 16 } ,
2026-05-12 04:19:36 -03:00
unit = " none " ,
text_mode = " name_and_value " ,
targets = [
{
" expr " : overview_metric_pair_expr (
f " max( { climate_humidity_series } ) or on() vector(0) " ,
" % RH " ,
f " max( { climate_pressure_series } ) or on() vector(0) " ,
" kPa " ,
) ,
" refId " : " A " ,
" legendFormat " : " {{ metric}} " ,
" instant " : True ,
}
] ,
field_overrides = [
{ " matcher " : { " id " : " byName " , " options " : " % RH " } , " properties " : [ { " id " : " unit " , " value " : " suffix: % RH " } ] } ,
{ " matcher " : { " id " : " byName " , " options " : " kPa " } , " properties " : [ { " id " : " unit " , " value " : " suffix:kPa " } ] } ,
] ,
links = overview_link ( " atlas-power " ) ,
2026-04-13 03:35:39 -03:00
)
2026-05-16 03:31:04 -03:00
climate_panel [ " options " ] [ " text " ] = compact_current_text
2026-05-12 04:19:36 -03:00
panels . append ( climate_panel )
2026-04-19 14:18:41 -03:00
panels . append (
timeseries_panel (
2026-04-03 22:16:02 -03:00
43 ,
2026-05-12 04:19:36 -03:00
" Enclosure Climate History " ,
2026-04-03 22:16:02 -03:00
None ,
2026-05-16 03:31:04 -03:00
{ " h " : 6 , " w " : 6 , " x " : 3 , " y " : 13 } ,
2026-05-12 04:19:36 -03:00
unit = " none " ,
2026-04-03 22:16:02 -03:00
targets = [
2026-05-12 04:19:36 -03:00
{ " refId " : " A " , " expr " : climate_temp_series , " legendFormat " : " C " } ,
{ " refId " : " B " , " expr " : climate_humidity_series , " legendFormat " : " RH " } ,
{ " refId " : " C " , " expr " : climate_pressure_series , " legendFormat " : " P " } ,
{ " refId " : " D " , " expr " : f " (min_over_time( { climate_temp_series } [$__range]) - 0.08) " , " legendFormat " : " C bound min " } ,
{ " refId " : " E " , " expr " : f " (max_over_time( { climate_temp_series } [$__range]) + 0.08) " , " legendFormat " : " C bound max " } ,
{ " refId " : " F " , " expr " : f " clamp_min((min_over_time( { climate_humidity_series } [$__range]) - 0.35), 0) " , " legendFormat " : " RH bound min " } ,
{ " refId " : " G " , " expr " : f " clamp_max((max_over_time( { climate_humidity_series } [$__range]) + 0.35), 100) " , " legendFormat " : " RH bound max " } ,
{ " refId " : " H " , " expr " : f " clamp_min((min_over_time( { climate_pressure_series } [$__range]) - 0.03), 0) " , " legendFormat " : " P bound min " } ,
{ " refId " : " I " , " expr " : f " (max_over_time( { climate_pressure_series } [$__range]) + 0.03) " , " legendFormat " : " P bound max " } ,
2026-04-03 22:16:02 -03:00
] ,
field_overrides = [
2026-04-12 22:53:23 -03:00
{
2026-05-12 04:19:36 -03:00
" matcher " : { " id " : " byName " , " options " : " C " } ,
" properties " : [
{ " id " : " unit " , " value " : " suffix:°C " } ,
{ " id " : " decimals " , " value " : 2 } ,
{ " id " : " custom.axisPlacement " , " value " : " left " } ,
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
] ,
} ,
{
" matcher " : { " id " : " byRegexp " , " options " : " C bound .* " } ,
2026-04-13 00:17:29 -03:00
" properties " : [
2026-05-12 04:19:36 -03:00
{ " id " : " unit " , " value " : " suffix:°C " } ,
{ " id " : " custom.axisPlacement " , " value " : " left " } ,
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
{ " id " : " custom.hideFrom " , " value " : { " legend " : True , " tooltip " : True , " viz " : False } } ,
{ " id " : " custom.lineWidth " , " value " : 0 } ,
{ " id " : " custom.fillOpacity " , " value " : 0 } ,
{ " id " : " custom.showPoints " , " value " : " never " } ,
{ " id " : " color " , " value " : { " mode " : " fixed " , " fixedColor " : " transparent " } } ,
2026-04-13 00:17:29 -03:00
] ,
} ,
2026-04-12 17:28:15 -03:00
{
2026-05-12 04:19:36 -03:00
" matcher " : { " id " : " byName " , " options " : " RH " } ,
2026-04-03 22:16:02 -03:00
" properties " : [
2026-05-12 04:19:36 -03:00
{ " id " : " unit " , " value " : " suffix: % " } ,
{ " id " : " decimals " , " value " : 2 } ,
{ " id " : " custom.axisPlacement " , " value " : " right " } ,
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
] ,
} ,
{
" matcher " : { " id " : " byRegexp " , " options " : " RH bound .* " } ,
" properties " : [
{ " id " : " unit " , " value " : " suffix: % " } ,
{ " id " : " custom.axisPlacement " , " value " : " right " } ,
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
{ " id " : " custom.hideFrom " , " value " : { " legend " : True , " tooltip " : True , " viz " : False } } ,
{ " id " : " custom.lineWidth " , " value " : 0 } ,
{ " id " : " custom.fillOpacity " , " value " : 0 } ,
{ " id " : " custom.showPoints " , " value " : " never " } ,
{ " id " : " color " , " value " : { " mode " : " fixed " , " fixedColor " : " transparent " } } ,
] ,
} ,
{
" matcher " : { " id " : " byName " , " options " : " P " } ,
" properties " : [
{ " id " : " unit " , " value " : " suffix:kPa " } ,
2026-04-03 22:16:02 -03:00
{ " id " : " custom.axisPlacement " , " value " : " right " } ,
{ " id " : " decimals " , " value " : 2 } ,
2026-05-12 04:19:36 -03:00
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
2026-04-13 00:17:29 -03:00
] ,
2026-05-12 04:19:36 -03:00
} ,
{
" matcher " : { " id " : " byRegexp " , " options " : " P bound .* " } ,
" properties " : [
{ " id " : " unit " , " value " : " suffix:kPa " } ,
{ " id " : " custom.axisPlacement " , " value " : " right " } ,
{ " id " : " custom.axisCenteredZero " , " value " : False } ,
{ " id " : " custom.hideFrom " , " value " : { " legend " : True , " tooltip " : True , " viz " : False } } ,
{ " id " : " custom.lineWidth " , " value " : 0 } ,
{ " id " : " custom.fillOpacity " , " value " : 0 } ,
{ " id " : " custom.showPoints " , " value " : " never " } ,
{ " id " : " color " , " value " : { " mode " : " fixed " , " fixedColor " : " transparent " } } ,
] ,
} ,
2026-04-03 22:16:02 -03:00
] ,
2026-04-12 18:35:15 -03:00
legend_display = " list " ,
legend_placement = " bottom " ,
2026-05-12 04:19:36 -03:00
links = overview_link ( " atlas-power " ) ,
description = " Temperature on left axis, humidity and pressure on right axis with dynamic bound series so small swings remain visible. " ,
2026-04-03 22:16:02 -03:00
)
2026-04-19 14:18:41 -03:00
)
2026-05-12 04:19:36 -03:00
panels [ - 1 ] [ " fieldConfig " ] [ " defaults " ] [ " custom " ] = {
" drawStyle " : " line " ,
" lineInterpolation " : " linear " ,
" lineWidth " : 2 ,
" fillOpacity " : 10 ,
" showPoints " : " never " ,
" spanNulls " : True ,
}
2026-05-16 05:08:09 -03:00
fan_panel = state_timeline_panel (
141 ,
" Fan Intensity History " ,
fan_intensity_expr ,
{ " h " : 6 , " w " : 6 , " x " : 9 , " y " : 13 } ,
unit = " none " ,
min_value = 0 ,
max_value = 10 ,
legend = " {{ fan}} " ,
thresholds = fan_intensity_thresholds ,
links = overview_link ( " atlas-power " ) ,
description = " Fan intensity lanes on the 0-10 controller scale. Cooler colors are quiet/low intensity; warmer colors mean the enclosure is pushing harder. " ,
2026-04-03 14:55:16 -03:00
)
2026-05-16 06:11:22 -03:00
fan_panel [ " options " ] [ " legend " ] = { " displayMode " : " list " , " placement " : " bottom " }
fan_panel [ " options " ] [ " mergeValues " ] = False
fan_panel [ " options " ] [ " showValue " ] = " auto "
2026-05-16 05:08:09 -03:00
fan_panel [ " options " ] [ " tooltip " ] = { " mode " : " multi " , " sort " : " none " }
panels . append ( fan_panel )
2026-05-15 22:43:44 -03:00
flux_source = stat_panel (
140 ,
" Flux Source " ,
2026-05-15 19:37:03 -03:00
None ,
2026-05-16 03:31:04 -03:00
{ " h " : 2 , " w " : 3 , " x " : 21 , " y " : 7 } ,
2026-05-15 19:37:03 -03:00
unit = " none " ,
2026-05-15 22:43:44 -03:00
text_mode = " name " ,
thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : dark_red , " value " : None } ,
{ " color " : dark_blue , " value " : 1 } ,
] ,
} ,
2026-05-15 19:37:03 -03:00
targets = [
2026-05-15 22:43:44 -03:00
{
" expr " : f " { GITOPS_SOURCE_INFO } or on() vector(0) " ,
" refId " : " A " ,
2026-05-16 02:21:05 -03:00
" legendFormat " : " {{ branch}} " ,
2026-05-15 22:43:44 -03:00
" instant " : True ,
}
2026-05-15 19:37:03 -03:00
] ,
links = overview_link ( " atlas-gitops " ) ,
2026-05-16 02:21:05 -03:00
description = " Flux GitRepository branch reported by Ananke. Revision and object detail live in Atlas GitOps. " ,
2026-05-15 22:07:41 -03:00
)
2026-05-15 22:43:44 -03:00
flux_source [ " options " ] [ " graphMode " ] = " none "
2026-05-16 03:31:04 -03:00
flux_source [ " options " ] [ " text " ] = { " titleSize " : 10 , " valueSize " : 14 }
2026-05-15 22:43:44 -03:00
panels . append ( flux_source )
2026-05-16 03:31:04 -03:00
for panel_id , title , expr , y_pos , unit , decimals , thresholds , links in [
( 151 , " Run Reliability (24h) " , TEST_SUCCESS_RATE_24H , 9 , " percent " , 1 , test_success_thresholds , " atlas-testing " ) ,
( 152 , " Failed Runs (24h) " , TEST_FAILURES_24H_TOTAL , 11 , " none " , 0 , failure_count_thresholds , " atlas-testing " ) ,
( 153 , " Fresh Suites (24h) " , PLATFORM_TEST_ACTIVE_SUITES_24H , 13 , " none " , 0 , perfect_count_thresholds , " atlas-testing " ) ,
( 154 , " Avg Coverage " , overview_avg_coverage , 15 , " percent " , 1 , test_success_thresholds , " atlas-testing " ) ,
( 155 , " LOC Clean Suites " , overview_loc_clean_suites , 17 , " none " , 0 , perfect_count_thresholds , " atlas-testing " ) ,
] :
rail_panel = stat_panel (
panel_id ,
title ,
expr ,
{ " h " : 2 , " w " : 3 , " x " : 21 , " y " : y_pos } ,
unit = unit ,
decimals = decimals ,
instant = True ,
thresholds = thresholds ,
links = overview_link ( links ) ,
)
rail_panel [ " options " ] [ " graphMode " ] = " none "
rail_panel [ " options " ] [ " text " ] = { " titleSize " : 10 , " valueSize " : 19 }
panels . append ( rail_panel )
2026-05-15 22:07:41 -03:00
panels . append (
2026-05-16 05:08:09 -03:00
state_timeline_panel (
2026-05-15 22:43:44 -03:00
150 ,
" GitOps Health " ,
2026-05-16 05:08:09 -03:00
gitops_health_history_expr ,
{ " h " : 6 , " w " : 6 , " x " : 15 , " y " : 7 } ,
2026-05-15 22:43:44 -03:00
unit = " percent " ,
2026-05-16 05:08:09 -03:00
min_value = 0 ,
max_value = 100 ,
2026-05-15 22:43:44 -03:00
legend = " {{ signal}} " ,
2026-05-15 22:07:41 -03:00
thresholds = test_success_thresholds ,
links = overview_link ( " atlas-gitops " ) ,
2026-05-16 05:08:09 -03:00
description = " GitOps readiness and suspension health over time. Blue means perfect; warmer colors mean a readiness or suspension problem appeared. " ,
2026-05-15 22:07:41 -03:00
)
2026-05-15 19:37:03 -03:00
)
2026-04-03 14:55:16 -03:00
panels . append (
2026-04-09 16:35:14 -03:00
bargauge_panel (
2026-04-03 14:55:16 -03:00
44 ,
2026-04-09 16:35:14 -03:00
" One-off Job Pods (age hours) " ,
ONEOFF_JOB_POD_AGE_HOURS ,
2026-05-12 04:19:36 -03:00
{ " h " : 5 , " w " : 8 , " x " : 0 , " y " : 32 } ,
2026-04-04 01:33:15 -03:00
unit = " h " ,
2026-04-03 14:55:16 -03:00
instant = True ,
2026-04-09 16:35:14 -03:00
legend = " {{ namespace}}/ {{ pod}} " ,
thresholds = age_thresholds ,
limit = 12 ,
decimals = 2 ,
2026-05-16 03:04:27 -03:00
links = overview_link ( " atlas-testing " ) ,
2026-05-12 04:19:36 -03:00
include_color = False ,
2026-04-03 14:55:16 -03:00
)
)
2026-05-15 22:07:41 -03:00
ariadne_volume = timeseries_panel (
45 ,
" Ariadne Run Volume " ,
2026-04-08 23:33:17 -03:00
None ,
2026-05-16 03:31:04 -03:00
{ " h " : 6 , " w " : 6 , " x " : 9 , " y " : 7 } ,
2026-05-15 22:07:41 -03:00
unit = " none " ,
targets = [
{ " expr " : f " { ARIADNE_TASK_ATTEMPTS_SERIES } or on() vector(0) " , " refId " : " A " , " legendFormat " : " Attempts " } ,
{ " expr " : f " { ARIADNE_TASK_FAILURES_SERIES } or on() vector(0) " , " refId " : " B " , " legendFormat " : " Failures " } ,
] ,
2026-05-16 05:58:59 -03:00
legend_display = " list " ,
legend_placement = " bottom " ,
2026-05-16 03:04:27 -03:00
links = overview_link ( " atlas-testing " ) ,
2026-04-03 14:55:16 -03:00
)
2026-05-15 22:43:44 -03:00
ariadne_volume [ " fieldConfig " ] [ " overrides " ] = fixed_color_overrides (
{ " Attempts " : dark_blue , " Failures " : dark_red }
)
2026-05-15 22:07:41 -03:00
panels . append ( apply_bar_timeseries_style ( ariadne_volume , stacked = False ) )
panels . append (
2026-05-16 05:08:09 -03:00
state_timeline_panel (
2026-05-15 22:07:41 -03:00
46 ,
2026-05-15 22:43:44 -03:00
" Gate Checks Passing by Suite " ,
2026-05-15 22:07:41 -03:00
PLATFORM_TEST_CURRENT_GATE_HEALTH_BY_SUITE ,
2026-05-16 05:08:09 -03:00
{ " h " : 6 , " w " : 6 , " x " : 15 , " y " : 13 } ,
2026-05-15 22:07:41 -03:00
unit = " percent " ,
2026-05-16 05:08:09 -03:00
min_value = 0 ,
max_value = 100 ,
2026-05-15 22:07:41 -03:00
legend = " {{ suite}} " ,
thresholds = test_success_thresholds ,
links = overview_link ( " atlas-testing " ) ,
2026-05-16 05:08:09 -03:00
description = " Percent of current gate dimensions passing per suite over time. There are seven gate dimensions, so 85.7 % means one gate is failing. " ,
2026-05-15 22:07:41 -03:00
)
)
2026-05-16 05:18:53 -03:00
panels [ - 1 ] [ " options " ] [ " legend " ] = { " displayMode " : " hidden " , " placement " : " bottom " }
2026-05-16 06:11:22 -03:00
panels [ - 1 ] [ " options " ] [ " mergeValues " ] = False
panels [ - 1 ] [ " options " ] [ " showValue " ] = " auto "
2026-05-12 04:19:36 -03:00
for panel_id , title , metric , x_pos , description in [
(
142 ,
" Jenkins Last Success (h, newest first) " ,
" ariadne_jenkins_build_weather_job_last_success_timestamp_seconds " ,
8 ,
" Top 6 most recent Jenkins successes by age (newest first). Green means last run succeeded; red means last run did not succeed. Use Atlas Jobs for the full list. " ,
) ,
(
243 ,
" Jenkins Last Failure (h, newest first) " ,
" ariadne_jenkins_build_weather_job_last_failure_timestamp_seconds " ,
12 ,
" Top 6 most recent Jenkins failures by age (newest first). Green means last run succeeded; red means last run did not succeed. Use Atlas Jobs for the full list. " ,
) ,
] :
base_expr = f " min by (exported_job,job_url,weather_icon) ((time() - { metric } ) / 3600) "
topk_expr = f " sort(bottomk(6, { base_expr } )) "
success_expr = (
f ' label_replace(( { topk_expr } ) and on(exported_job,job_url,weather_icon) '
' (max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) == 1), '
' " run_state " , " ok " , " exported_job " , " .* " ) '
)
failure_expr = (
f ' label_replace(( { topk_expr } ) and on(exported_job,job_url,weather_icon) '
' (max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) != 1), '
' " run_state " , " bad " , " exported_job " , " .* " ) '
)
panels . append (
{
" id " : panel_id ,
" type " : " stat " ,
" title " : title ,
" datasource " : PROM_DS ,
" gridPos " : { " h " : 5 , " w " : 4 , " x " : x_pos , " y " : 32 } ,
" targets " : [
{
" refId " : " A " ,
" expr " : f " sort(( { success_expr } ) or ( { failure_expr } )) " ,
" instant " : True ,
}
] ,
" fieldConfig " : {
" defaults " : {
" unit " : " h " ,
" decimals " : 1 ,
" min " : 0 ,
" displayName " : " $ {__field.labels.weather_icon} $ {__field.labels.exported_job} " ,
" links " : [
{
" title " : " Open Jenkins job " ,
" url " : " https://ci.bstein.dev/job/$ {__field.labels.exported_job} / " ,
" targetBlank " : True ,
}
] ,
} ,
" overrides " : [
{
" matcher " : { " id " : " byRegexp " , " options " : ' .*run_state= " ok " .* ' } ,
" properties " : [ { " id " : " color " , " value " : { " mode " : " fixed " , " fixedColor " : " green " } } ] ,
} ,
{
" matcher " : { " id " : " byRegexp " , " options " : ' .*run_state= " bad " .* ' } ,
" properties " : [ { " id " : " color " , " value " : { " mode " : " fixed " , " fixedColor " : " red " } } ] ,
} ,
] ,
} ,
" options " : {
" colorMode " : " value " ,
" graphMode " : " none " ,
" justifyMode " : " left " ,
" orientation " : " horizontal " ,
" wideLayout " : True ,
" reduceOptions " : { " calcs " : [ " lastNotNull " ] , " fields " : " " , " values " : False } ,
" textMode " : " name_and_value " ,
" text " : { " titleSize " : 11 , " valueSize " : 11 } ,
} ,
" transformations " : [ { " id " : " sortBy " , " options " : { " fields " : [ " Value " ] , " order " : " asc " } } ] ,
2026-05-16 03:04:27 -03:00
" links " : overview_link ( " atlas-testing " ) ,
2026-05-12 04:19:36 -03:00
" description " : description ,
}
)
2026-04-09 19:27:48 -03:00
panels . append (
2026-04-09 20:16:44 -03:00
bargauge_panel (
2026-04-09 19:27:48 -03:00
47 ,
2026-04-11 11:54:43 -03:00
" PVC Backup Health / Age " ,
2026-05-12 04:19:36 -03:00
overview_pvc_backup_age ,
{ " h " : 5 , " w " : 8 , " x " : 16 , " y " : 32 } ,
2026-04-11 11:54:43 -03:00
unit = " h " ,
2026-04-09 19:27:48 -03:00
instant = True ,
2026-04-11 11:54:43 -03:00
legend = " {{ namespace}}/ {{ pvc}} " ,
2026-04-09 20:16:44 -03:00
sort_order = " desc " ,
thresholds = {
" mode " : " absolute " ,
" steps " : [
2026-04-11 11:54:43 -03:00
{ " color " : " green " , " value " : None } ,
2026-04-14 02:14:43 -03:00
{ " color " : " yellow " , " value " : 20 } ,
{ " color " : " orange " , " value " : 40 } ,
{ " color " : " red " , " value " : 50 } ,
2026-04-09 20:16:44 -03:00
] ,
} ,
2026-05-12 04:19:36 -03:00
include_color = False ,
2026-04-09 19:27:48 -03:00
)
2026-04-03 14:55:16 -03:00
)
2026-05-12 04:19:36 -03:00
panels [ - 1 ] [ " links " ] = overview_link ( " atlas-storage " )
2026-04-09 20:16:44 -03:00
panels [ - 1 ] [ " description " ] = (
2026-05-12 04:19:36 -03:00
" Backup age in hours computed from last-success timestamps for restic-managed PVCs (nightly target: <=20h green, <40h yellow, <50h orange, >=50h red). PVCs that have backup history but currently no successful backup (missing/no_completed/error) are pinned to 999h for visibility. "
2026-04-09 20:16:44 -03:00
)
2026-04-03 14:55:16 -03:00
2026-01-06 02:34:52 -03:00
panels . append (
stat_panel (
2026-01-05 21:55:59 -03:00
30 ,
2026-01-06 02:34:52 -03:00
" Mail Sent (1d) " ,
' max(postmark_outbound_sent { window= " 1d " }) ' ,
2026-05-12 04:19:36 -03:00
{ " h " : 2 , " w " : 4 , " x " : 0 , " y " : 19 } ,
2026-01-06 02:34:52 -03:00
unit = " none " ,
2026-05-12 04:19:36 -03:00
links = overview_link ( " atlas-mail " ) ,
2026-01-06 02:34:52 -03:00
)
)
panels . append (
{
" id " : 31 ,
" type " : " stat " ,
" title " : " Mail Bounces (1d) " ,
" datasource " : PROM_DS ,
2026-05-12 04:19:36 -03:00
" gridPos " : { " h " : 2 , " w " : 4 , " x " : 8 , " y " : 19 } ,
2026-01-06 02:34:52 -03:00
" targets " : [
{
" expr " : ' max(postmark_outbound_bounce_rate { window= " 1d " }) ' ,
" refId " : " A " ,
" legendFormat " : " Rate " ,
} ,
{
" expr " : ' max(postmark_outbound_bounced { window= " 1d " }) ' ,
" refId " : " B " ,
" legendFormat " : " Count " ,
} ,
] ,
" fieldConfig " : {
" defaults " : {
" color " : { " mode " : " thresholds " } ,
" custom " : { " displayMode " : " auto " } ,
" thresholds " : mail_bounce_rate_thresholds ,
" unit " : " none " ,
} ,
" overrides " : [
{
" matcher " : { " id " : " byName " , " options " : " Rate " } ,
" properties " : [ { " id " : " unit " , " value " : " percent " } ] ,
} ,
{
" matcher " : { " id " : " byName " , " options " : " Count " } ,
" properties " : [ { " id " : " unit " , " value " : " none " } ] ,
} ,
] ,
} ,
" options " : {
" colorMode " : " value " ,
" graphMode " : " area " ,
" justifyMode " : " center " ,
" reduceOptions " : { " calcs " : [ " lastNotNull " ] , " fields " : " " , " values " : False } ,
" textMode " : " name_and_value " ,
} ,
2026-05-12 04:19:36 -03:00
" links " : overview_link ( " atlas-mail " ) ,
2026-01-06 02:34:52 -03:00
}
)
panels . append (
stat_panel (
2026-01-05 21:55:59 -03:00
32 ,
2026-01-06 02:34:52 -03:00
" Mail Success Rate (1d) " ,
' clamp_min(100 - max(postmark_outbound_bounce_rate { window= " 1d " }), 0) ' ,
2026-05-12 04:19:36 -03:00
{ " h " : 2 , " w " : 4 , " x " : 4 , " y " : 19 } ,
2026-01-06 02:34:52 -03:00
unit = " percent " ,
thresholds = mail_success_thresholds ,
decimals = 1 ,
2026-05-12 04:19:36 -03:00
links = overview_link ( " atlas-mail " ) ,
2026-01-06 02:34:52 -03:00
)
)
panels . append (
stat_panel (
33 ,
2026-01-06 02:06:20 -03:00
" Mail Limit Used (30d) " ,
" max(postmark_sending_limit_used_percent) " ,
2026-05-12 04:19:36 -03:00
{ " h " : 2 , " w " : 4 , " x " : 12 , " y " : 19 } ,
2026-01-06 02:34:52 -03:00
unit = " percent " ,
thresholds = mail_limit_thresholds ,
decimals = 1 ,
2026-05-12 04:19:36 -03:00
links = overview_link ( " atlas-mail " ) ,
2026-01-05 21:55:59 -03:00
)
2026-01-06 02:34:52 -03:00
)
2026-01-22 15:23:23 -03:00
panels . append (
2026-01-22 18:23:17 -03:00
stat_panel (
2026-01-22 15:23:23 -03:00
34 ,
" Postgres Connections Used " ,
2026-01-22 18:23:17 -03:00
POSTGRES_CONN_USED ,
2026-05-12 04:19:36 -03:00
{ " h " : 2 , " w " : 4 , " x " : 16 , " y " : 19 } ,
2026-01-22 18:23:17 -03:00
decimals = 0 ,
text_mode = " name_and_value " ,
legend = " {{ conn}} " ,
instant = True ,
2026-01-22 15:23:23 -03:00
)
)
panels . append (
stat_panel (
35 ,
" Postgres Hottest Connections " ,
POSTGRES_CONN_HOTTEST ,
2026-05-12 04:19:36 -03:00
{ " h " : 2 , " w " : 4 , " x " : 20 , " y " : 19 } ,
2026-01-22 15:23:23 -03:00
unit = " none " ,
decimals = 0 ,
text_mode = " name_and_value " ,
legend = " {{ datname}} " ,
instant = True ,
)
)
2026-01-05 21:55:59 -03:00
2026-01-01 14:44:33 -03:00
cpu_scope = " $namespace_scope_cpu "
gpu_scope = " $namespace_scope_gpu "
ram_scope = " $namespace_scope_ram "
2025-11-17 14:22:46 -03:00
panels . append (
2025-11-17 16:27:38 -03:00
pie_panel (
11 ,
2025-12-02 14:41:39 -03:00
" Namespace CPU Share " ,
2026-01-01 14:44:33 -03:00
namespace_cpu_share_expr ( cpu_scope ) ,
2026-04-04 01:33:15 -03:00
{ " h " : 9 , " w " : 8 , " x " : 0 , " y " : 23 } ,
2026-01-01 14:44:33 -03:00
links = namespace_scope_links ( " namespace_scope_cpu " ) ,
2026-01-18 02:50:07 -03:00
description = " Shares are normalized within the selected filter. Switching scope changes the denominator. " ,
2025-11-17 23:12:16 -03:00
)
)
panels . append (
pie_panel (
2025-11-17 23:42:55 -03:00
12 ,
2025-12-02 14:41:39 -03:00
" Namespace GPU Share " ,
2026-01-01 14:44:33 -03:00
namespace_gpu_share_expr ( gpu_scope ) ,
2026-04-04 01:33:15 -03:00
{ " h " : 9 , " w " : 8 , " x " : 8 , " y " : 23 } ,
2026-01-01 14:44:33 -03:00
links = namespace_scope_links ( " namespace_scope_gpu " ) ,
2026-01-18 02:50:07 -03:00
description = " Shares are normalized within the selected filter. Switching scope changes the denominator. " ,
2025-11-18 00:11:39 -03:00
)
)
panels . append (
pie_panel (
13 ,
2025-12-02 14:41:39 -03:00
" Namespace RAM Share " ,
2026-01-01 14:44:33 -03:00
namespace_ram_share_expr ( ram_scope ) ,
2026-04-04 01:33:15 -03:00
{ " h " : 9 , " w " : 8 , " x " : 16 , " y " : 23 } ,
2026-01-01 14:44:33 -03:00
links = namespace_scope_links ( " namespace_scope_ram " ) ,
2026-01-18 02:50:07 -03:00
description = " Shares are normalized within the selected filter. Switching scope changes the denominator. " ,
2025-11-17 14:22:46 -03:00
)
)
2025-11-17 21:48:12 -03:00
worker_filter = f " { WORKER_REGEX } "
2025-11-17 14:22:46 -03:00
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
14 ,
2025-12-02 14:41:39 -03:00
" Worker Node CPU " ,
2025-11-17 21:48:12 -03:00
node_cpu_expr ( worker_filter ) ,
2026-05-12 04:19:36 -03:00
{ " h " : 12 , " w " : 12 , " x " : 0 , " y " : 44 } ,
2025-11-17 14:22:46 -03:00
unit = " percent " ,
legend = " {{ node}} " ,
legend_calcs = [ " last " ] ,
legend_display = " table " ,
legend_placement = " right " ,
2026-05-12 04:19:36 -03:00
links = overview_link ( " atlas-nodes " ) ,
2025-11-17 14:22:46 -03:00
)
)
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
15 ,
2025-12-02 14:41:39 -03:00
" Worker Node RAM " ,
2025-11-17 21:48:12 -03:00
node_mem_expr ( worker_filter ) ,
2026-05-12 04:19:36 -03:00
{ " h " : 12 , " w " : 12 , " x " : 12 , " y " : 44 } ,
2025-11-17 14:22:46 -03:00
unit = " percent " ,
legend = " {{ node}} " ,
legend_calcs = [ " last " ] ,
legend_display = " table " ,
legend_placement = " right " ,
2026-05-12 04:19:36 -03:00
links = overview_link ( " atlas-nodes " ) ,
2025-11-17 14:22:46 -03:00
)
)
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
16 ,
2025-11-17 21:48:12 -03:00
" Control plane CPU " ,
2025-12-12 21:55:53 -03:00
node_cpu_expr ( CONTROL_ALL_REGEX ) ,
2026-05-12 04:19:36 -03:00
{ " h " : 10 , " w " : 12 , " x " : 0 , " y " : 56 } ,
2025-11-17 14:22:46 -03:00
unit = " percent " ,
legend = " {{ node}} " ,
2025-11-17 16:27:38 -03:00
legend_display = " table " ,
legend_placement = " right " ,
2025-11-17 14:22:46 -03:00
)
)
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
17 ,
2025-11-17 21:48:12 -03:00
" Control plane RAM " ,
2025-12-12 21:55:53 -03:00
node_mem_expr ( CONTROL_ALL_REGEX ) ,
2026-05-12 04:19:36 -03:00
{ " h " : 10 , " w " : 12 , " x " : 12 , " y " : 56 } ,
2025-11-17 14:22:46 -03:00
unit = " percent " ,
legend = " {{ node}} " ,
2025-11-17 16:27:38 -03:00
legend_display = " table " ,
legend_placement = " right " ,
2025-11-17 14:22:46 -03:00
)
)
2025-12-12 18:51:43 -03:00
panels . append (
pie_panel (
28 ,
2025-12-12 20:30:00 -03:00
" Node Pod Share " ,
2025-12-12 20:40:32 -03:00
' (sum(kube_pod_info { pod!= " " , node!= " " }) by (node) / clamp_min(sum(kube_pod_info { pod!= " " , node!= " " }), 1)) * 100 ' ,
2026-05-12 04:19:36 -03:00
{ " h " : 10 , " w " : 12 , " x " : 0 , " y " : 66 } ,
2025-12-12 18:51:43 -03:00
)
)
panels . append (
bargauge_panel (
29 ,
" Top Nodes by Pod Count " ,
2025-12-12 19:09:51 -03:00
' topk(12, sum(kube_pod_info { pod!= " " , node!= " " }) by (node)) ' ,
2026-05-12 04:19:36 -03:00
{ " h " : 10 , " w " : 12 , " x " : 12 , " y " : 66 } ,
2025-12-12 18:51:43 -03:00
unit = " none " ,
2025-12-12 18:56:13 -03:00
limit = 12 ,
2025-12-12 20:20:13 -03:00
decimals = 0 ,
thresholds = {
" mode " : " absolute " ,
" steps " : [
{ " color " : " green " , " value " : None } ,
{ " color " : " yellow " , " value " : 50 } ,
{ " color " : " orange " , " value " : 75 } ,
{ " color " : " red " , " value " : 100 } ,
] ,
} ,
2025-12-12 20:30:00 -03:00
instant = True ,
2026-05-12 04:19:36 -03:00
include_color = False ,
2025-12-12 18:51:43 -03:00
)
)
2025-11-17 14:22:46 -03:00
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
18 ,
2025-12-02 14:41:39 -03:00
" Cluster Ingress Throughput " ,
2025-11-17 16:27:38 -03:00
NET_INGRESS_EXPR ,
2026-05-12 04:19:36 -03:00
{ " h " : 7 , " w " : 8 , " x " : 0 , " y " : 37 } ,
2025-11-17 18:55:11 -03:00
unit = " Bps " ,
2025-11-18 14:08:33 -03:00
legend = " Ingress (Traefik) " ,
2025-11-17 16:27:38 -03:00
legend_display = " list " ,
legend_placement = " bottom " ,
2026-05-12 04:19:36 -03:00
links = overview_link ( " atlas-network " ) ,
2025-11-17 16:27:38 -03:00
)
)
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
19 ,
2025-12-02 14:41:39 -03:00
" Cluster Egress Throughput " ,
2025-11-17 16:27:38 -03:00
NET_EGRESS_EXPR ,
2026-05-12 04:19:36 -03:00
{ " h " : 7 , " w " : 8 , " x " : 8 , " y " : 37 } ,
2025-11-17 18:55:11 -03:00
unit = " Bps " ,
2025-11-18 14:08:33 -03:00
legend = " Egress (Traefik) " ,
2025-11-17 16:27:38 -03:00
legend_display = " list " ,
legend_placement = " bottom " ,
2026-05-12 04:19:36 -03:00
links = overview_link ( " atlas-network " ) ,
2025-11-17 16:27:38 -03:00
)
)
panels . append (
timeseries_panel (
2025-11-18 00:11:39 -03:00
20 ,
2025-12-02 14:41:39 -03:00
" Intra-Cluster Throughput " ,
2025-11-18 14:08:33 -03:00
NET_INTERNAL_EXPR ,
2026-05-12 04:19:36 -03:00
{ " h " : 7 , " w " : 8 , " x " : 16 , " y " : 37 } ,
2025-11-18 14:08:33 -03:00
unit = " Bps " ,
legend = " Internal traffic " ,
legend_display = " list " ,
legend_placement = " bottom " ,
2026-05-12 04:19:36 -03:00
links = overview_link ( " atlas-network " ) ,
2025-11-18 14:08:33 -03:00
)
)
panels . append (
timeseries_panel (
21 ,
2025-12-02 14:41:39 -03:00
" Root Filesystem Usage " ,
2025-11-17 14:22:46 -03:00
root_usage_expr ( ) ,
2026-05-12 04:19:36 -03:00
{ " h " : 16 , " w " : 12 , " x " : 0 , " y " : 76 } ,
2025-11-17 14:22:46 -03:00
unit = " percent " ,
legend = " {{ node}} " ,
legend_calcs = [ " last " ] ,
legend_display = " table " ,
legend_placement = " right " ,
2025-11-17 16:27:38 -03:00
time_from = " 30d " ,
2026-05-12 04:19:36 -03:00
links = overview_link ( " atlas-storage " ) ,
2025-11-17 14:22:46 -03:00
)
)
panels . append (
2026-04-12 04:26:52 -03:00
timeseries_panel (
2025-12-02 13:16:00 -03:00
22 ,
2026-04-11 11:54:43 -03:00
" Nodes Closest to Full Astraios Disks " ,
2026-04-12 04:26:52 -03:00
astraios_usage_expr ( ) ,
2026-05-12 04:19:36 -03:00
{ " h " : 16 , " w " : 12 , " x " : 12 , " y " : 76 } ,
2025-12-02 13:16:00 -03:00
unit = " percent " ,
2026-04-12 04:26:52 -03:00
legend = " {{ node}} " ,
legend_calcs = [ " last " ] ,
legend_display = " table " ,
legend_placement = " right " ,
time_from = " 1w " ,
2026-05-12 04:19:36 -03:00
links = overview_link ( " atlas-storage " ) ,
2025-12-02 13:16:00 -03:00
)
2025-11-17 14:22:46 -03:00
)
return {
" uid " : " atlas-overview " ,
" title " : " Atlas Overview " ,
2025-11-17 16:27:38 -03:00
" folderUid " : PUBLIC_FOLDER ,
2025-11-17 14:22:46 -03:00
" editable " : False ,
2025-11-17 16:27:38 -03:00
" annotations " : { " list " : [ ] } ,
2025-11-17 14:22:46 -03:00
" panels " : panels ,
" schemaVersion " : 39 ,
" style " : " dark " ,
" tags " : [ " atlas " , " overview " ] ,
2026-01-01 14:44:33 -03:00
" templating " : {
" list " : [
namespace_scope_variable ( " namespace_scope_cpu " , " CPU namespace filter " ) ,
namespace_scope_variable ( " namespace_scope_gpu " , " GPU namespace filter " ) ,
namespace_scope_variable ( " namespace_scope_ram " , " RAM namespace filter " ) ,
]
} ,
2025-12-02 14:41:39 -03:00
" time " : { " from " : " now-1h " , " to " : " now " } ,
" refresh " : " 1m " ,
2026-05-16 02:56:52 -03:00
" links " : link_to ( " atlas-testing " ) ,
2025-11-17 14:22:46 -03:00
}
def build_pods_dashboard():
    """Assemble the "Atlas Pods" dashboard definition.

    Layout, top to bottom: four headline stat panels, three full-width
    problem tables, the top-nodes bar gauge beside the node pod-share pie,
    and a table reporting the node that carries the largest share of each
    namespace's pods.

    Returns:
        dict: dashboard model with uid ``atlas-pods`` filed under
        ``PRIVATE_FOLDER``.
    """

    def red_when_nonzero():
        # Built fresh on every call so no two panels share one dict object.
        return {
            "mode": "absolute",
            "steps": [
                {"color": "green", "value": None},
                {"color": "red", "value": 1},
            ],
        }

    def labels_to_fields():
        return {"id": "labelsToFields", "options": {}}

    # Selector shared by the per-node pod queries below.
    scheduled_pods = 'kube_pod_info{pod!="", node!=""}'

    # --- Row 1: headline counters, ids 1-4, each 6 columns wide -------------
    headline_stats = (
        ("Problem Pods", PROBLEM_PODS_EXPR),
        ("CrashLoop / ImagePull", CRASHLOOP_EXPR),
        ("Stuck Terminating (>10m)", STUCK_TERMINATING_EXPR),
        (
            "Control Plane Workloads",
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
        ),
    )
    panels = [
        stat_panel(
            column + 1,
            title,
            expr,
            {"h": 4, "w": 6, "x": 6 * column, "y": 0},
            thresholds=red_when_nonzero(),
        )
        for column, (title, expr) in enumerate(headline_stats)
    ]

    # --- Rows 2-4: detail tables, ids 5-7, stacked 10 rows apart ------------
    detail_tables = (
        ("Pods Not Running", PROBLEM_TABLE_EXPR, []),
        ("CrashLoop / ImagePull", CRASHLOOP_TABLE_EXPR, []),
        (
            "Terminating >10m",
            STUCK_TABLE_EXPR,
            [
                {
                    "id": "filterByValue",
                    "options": {"match": "Value", "operator": "gt", "value": 600},
                }
            ],
        ),
    )
    for row, (title, expr, extra_steps) in enumerate(detail_tables):
        panels.append(
            table_panel(
                5 + row,
                title,
                expr,
                {"h": 10, "w": 24, "x": 0, "y": 4 + 10 * row},
                unit="s",
                transformations=[labels_to_fields(), *extra_steps],
            )
        )

    # --- Row 5: pod distribution across nodes -------------------------------
    panels.append(
        pie_panel(
            8,
            "Node Pod Share",
            f"(sum({scheduled_pods}) by (node) / clamp_min(sum({scheduled_pods}), 1)) * 100",
            {"h": 8, "w": 12, "x": 12, "y": 34},
        )
    )
    panels.append(
        bargauge_panel(
            9,
            "Top Nodes by Pod Count",
            f"topk(12, sum({scheduled_pods}) by (node))",
            {"h": 8, "w": 12, "x": 0, "y": 34},
            unit="none",
            limit=12,
            decimals=0,
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 50},
                    {"color": "orange", "value": 75},
                    {"color": "red", "value": 100},
                ],
            },
            instant=True,
        )
    )

    # --- Row 6: node holding the plurality of each namespace ----------------
    # Percentage of a namespace's pods that sit on each node.
    share_expr = (
        f"(sum by (namespace,node) ({scheduled_pods}) "
        '/ on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=""}), 1) * 100)'
    )
    # One constant per known node: (kube_node_info * 0) + position/1000.
    # NOTE(review): presumably a tie-breaker so equal shares still yield a
    # single maximum per namespace - confirm.
    rank_expr = " or ".join(
        f"(sum by (node) (kube_node_info{{node=\"{node_name}\"}}) * 0 + {position * 1e-3})"
        for position, node_name in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
    )
    score_expr = f"{share_expr} + on(node) group_left() ({rank_expr})"
    # `== bool` yields 1 where a (namespace, node) score equals the
    # namespace-wide maximum and 0 elsewhere.
    mask_expr = (
        f"{score_expr} == bool on(namespace) group_left() "
        f"(max by (namespace) ({score_expr}))"
    )
    plurality_expr = f"{share_expr} * on(namespace,node) group_left() ({mask_expr})"
    plurality_steps = [
        labels_to_fields(),
        {"id": "organize", "options": {"excludeByName": {"Time": True}}},
        {
            "id": "filterByValue",
            "options": {"match": "Value", "operator": "gt", "value": 0},
        },
        {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}},
        {
            "id": "groupBy",
            "options": {
                "fields": {
                    "namespace": {
                        "aggregations": [
                            {"field": "Value", "operation": "max"},
                            {"field": "node", "operation": "first"},
                        ]
                    }
                },
                "rowBy": ["namespace"],
            },
        },
    ]
    panels.append(
        table_panel(
            10,
            "Namespace Plurality by Node v27",
            plurality_expr,
            {"h": 8, "w": 24, "x": 0, "y": 42},
            unit="percent",
            transformations=plurality_steps,
            instant=True,
            options={"showColumnFilters": False},
            filterable=False,
            footer={"show": False, "fields": "", "calcs": []},
            format="table",
        )
    )

    return {
        "uid": "atlas-pods",
        "title": "Atlas Pods",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "pods"],
    }
def build_nodes_dashboard():
    """Assemble the "Atlas Nodes" dashboard definition.

    Panels, top to bottom: node readiness and control-plane workload stats,
    API server / etcd health stats, cluster-wide CPU and RAM timeseries,
    control-plane CPU and RAM timeseries, and root / Astraios filesystem
    usage over a 30-day window.

    Returns:
        dict: dashboard model with uid ``atlas-nodes`` filed under
        ``PRIVATE_FOLDER``.
    """
    panels = []

    # --- Row 1: readiness counters ------------------------------------------
    panels.append(
        stat_panel(
            1,
            "Worker Nodes Ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
            {"h": 4, "w": 8, "x": 0, "y": 0},
            value_suffix=WORKER_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            2,
            "Control Plane Ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
            {"h": 4, "w": 8, "x": 8, "y": 0},
            value_suffix=CONTROL_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            3,
            "Control Plane Workloads",
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
            {"h": 4, "w": 8, "x": 16, "y": 0},
        )
    )

    # --- Row 2: API server and etcd health ----------------------------------
    panels.append(
        stat_panel(
            9,
            "API Server 5xx rate",
            APISERVER_5XX_RATE,
            {"h": 4, "w": 8, "x": 0, "y": 4},
            unit="req/s",
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 0.05},
                    {"color": "orange", "value": 0.2},
                    {"color": "red", "value": 0.5},
                ],
            },
            decimals=3,
        )
    )
    panels.append(
        stat_panel(
            10,
            "API Server P99 latency",
            APISERVER_P99_LATENCY_MS,
            {"h": 4, "w": 8, "x": 8, "y": 4},
            unit="ms",
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 250},
                    {"color": "orange", "value": 400},
                    {"color": "red", "value": 600},
                ],
            },
            decimals=1,
        )
    )
    panels.append(
        stat_panel(
            11,
            "etcd P99 latency",
            ETCD_P99_LATENCY_MS,
            {"h": 4, "w": 8, "x": 16, "y": 4},
            unit="ms",
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 50},
                    {"color": "orange", "value": 100},
                    {"color": "red", "value": 200},
                ],
            },
            decimals=1,
        )
    )

    # --- Rows 3-4: all-node CPU / RAM ---------------------------------------
    panels.append(
        timeseries_panel(
            4,
            "Node CPU",
            node_cpu_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 8},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            5,
            "Node RAM",
            node_mem_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 17},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )

    # --- Row 5: control-plane CPU / RAM side by side ------------------------
    panels.append(
        timeseries_panel(
            6,
            "Control Plane (incl. titan-db) CPU",
            node_cpu_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 0, "y": 26},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            7,
            "Control Plane (incl. titan-db) RAM",
            node_mem_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 12, "y": 26},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )

    # --- Rows 6-7: filesystem usage, fixed 30-day window --------------------
    panels.append(
        timeseries_panel(
            8,
            "Root Filesystem Usage",
            root_usage_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 35},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    panels.append(
        timeseries_panel(
            # Was 9, which collided with the "API Server 5xx rate" stat above.
            # Grafana expects panel ids to be unique within a dashboard, so
            # this panel takes the next unused id.
            12,
            "Astraios Usage",
            astraios_usage_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 44},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )

    return {
        "uid": "atlas-nodes",
        "title": "Atlas Nodes",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "nodes"],
    }
def build_storage_dashboard():
    """Assemble the "Atlas Storage" dashboard definition.

    The two mounts ``/mnt/astreae`` and ``/mnt/asteria`` each get the same
    four views (usage stat, free-space stat, per-node usage timeseries and a
    90-day usage history), laid out side by side. Two maintenance stats
    (image-sweeper DaemonSet readiness and CronJob freshness) follow.

    Returns:
        dict: dashboard model with uid ``atlas-storage`` filed under
        ``PRIVATE_FOLDER``.
    """
    # (title prefix, mountpoint), in left-to-right display order.
    volumes = (
        ("Astreae", "/mnt/astreae"),
        ("Asteria", "/mnt/asteria"),
    )
    panels = []

    # Row 1, left half: usage percentage stats (ids 1-2).
    for slot, (label, mount) in enumerate(volumes):
        panels.append(
            stat_panel(
                1 + slot,
                f"{label} Usage",
                astreae_usage_expr(mount),
                {"h": 5, "w": 6, "x": 6 * slot, "y": 0},
                unit="percent",
                thresholds=PERCENT_THRESHOLDS,
            )
        )

    # Row 1, right half: free-space stats (ids 3-4).
    for slot, (label, mount) in enumerate(volumes):
        panels.append(
            stat_panel(
                3 + slot,
                f"{label} Free",
                astreae_free_expr(mount),
                {"h": 5, "w": 6, "x": 12 + 6 * slot, "y": 0},
                unit="decbytes",
            )
        )

    # Row 2: per-node usage over 30 days (ids 5-6).
    for slot, (label, mount) in enumerate(volumes):
        panels.append(
            timeseries_panel(
                5 + slot,
                f"{label} Per-Node Usage",
                filesystem_usage_expr(mount, LONGHORN_NODE_REGEX),
                {"h": 9, "w": 12, "x": 12 * slot, "y": 5},
                unit="percent",
                legend="{{node}}",
                legend_display="table",
                legend_placement="right",
                time_from="30d",
            )
        )

    # Row 3: aggregate usage history over 90 days (ids 7-8).
    for slot, (label, mount) in enumerate(volumes):
        panels.append(
            timeseries_panel(
                7 + slot,
                f"{label} Usage History",
                astreae_usage_expr(mount),
                {"h": 9, "w": 12, "x": 12 * slot, "y": 14},
                unit="percent",
                time_from="90d",
            )
        )

    # Maintenance row: ready/desired ratio of the node-image-sweeper DaemonSet.
    sweeper_selector = '{namespace="maintenance",daemonset="node-image-sweeper"}'
    sweepers_ready_expr = (
        f"kube_daemonset_status_number_ready{sweeper_selector}"
        " / on(namespace,daemonset) "
        f"kube_daemonset_status_desired_number_scheduled{sweeper_selector}"
        " * 100"
    )
    panels.append(
        stat_panel(
            30,
            "Maintenance Sweepers Ready",
            sweepers_ready_expr,
            {"h": 4, "w": 12, "x": 0, "y": 44},
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
        )
    )

    # Seconds since the image-sweeper CronJob last succeeded;
    # yellow from 3600 s, red from 10800 s.
    cron_freshness_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 3600},
            {"color": "red", "value": 10800},
        ],
    }
    panels.append(
        stat_panel(
            31,
            "Maintenance Cron Freshness (s)",
            'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"})',
            {"h": 4, "w": 12, "x": 12, "y": 44},
            unit="s",
            thresholds=cron_freshness_thresholds,
        )
    )

    return {
        "uid": "atlas-storage",
        "title": "Atlas Storage",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "storage"],
    }
2025-11-17 16:27:38 -03:00
def build_network_dashboard():
    """Build the Atlas Network dashboard definition.

    Returns a dict holding the dashboard metadata (uid, title, folder,
    12-hour default time range, tags) and twelve panels, in this order:

    * ids 1-4 (y=0): ingress success rate, 1h and 6h error-budget burn,
      edge P99 latency;
    * ids 5-7 (y=4): ingress, egress and intra-cluster traffic;
    * id 8 (y=8): per-node throughput time series, full width;
    * ids 9-10 (y=16): top-namespaces and top-pods tables;
    * ids 11-12 (y=25): Traefik router and entrypoint request-rate series.
    """

    def _absolute(*steps):
        # Returns a new thresholds dict on every call, so no two panels ever
        # share the same dict object.
        return {
            " mode ": " absolute ",
            " steps ": [{" color ": tone, " value ": floor} for tone, floor in steps],
        }

    # Both burn-rate tiles use the same colour ladder.
    burn_ladder = ((" green ", None), (" yellow ", 1), (" orange ", 2), (" red ", 4))
    # Legend settings repeated by every time series that shows a legend.
    side_table_legend = {"legend_display": " table ", "legend_placement": " right "}

    panels = [
        # --- Row 1: ingress health tiles -----------------------------------
        stat_panel(
            1,
            " Ingress Success Rate (5m) ",
            TRAEFIK_SLI_5M,
            {" h ": 4, " w ": 6, " x ": 0, " y ": 0},
            unit=" percentunit ",
            decimals=2,
            thresholds=_absolute(
                (" red ", None),
                (" orange ", 0.995),
                (" yellow ", 0.999),
                (" green ", 0.9995),
            ),
        ),
        stat_panel(
            2,
            " Error Budget Burn (1h) ",
            traefik_burn(" 1h "),
            {" h ": 4, " w ": 6, " x ": 6, " y ": 0},
            thresholds=_absolute(*burn_ladder),
            decimals=2,
        ),
        stat_panel(
            3,
            " Error Budget Burn (6h) ",
            traefik_burn(" 6h "),
            {" h ": 4, " w ": 6, " x ": 12, " y ": 0},
            thresholds=_absolute(*burn_ladder),
            decimals=2,
        ),
        stat_panel(
            4,
            " Edge P99 Latency (ms) ",
            TRAEFIK_P99_LATENCY_MS,
            {" h ": 4, " w ": 6, " x ": 18, " y ": 0},
            unit=" ms ",
            thresholds=_absolute(
                (" green ", None),
                (" yellow ", 200),
                (" orange ", 350),
                (" red ", 500),
            ),
            decimals=1,
        ),
        # --- Row 2: traffic totals -----------------------------------------
        stat_panel(
            5,
            " Ingress Traffic ",
            NET_INGRESS_EXPR,
            {" h ": 4, " w ": 8, " x ": 0, " y ": 4},
            unit=" Bps ",
        ),
        stat_panel(
            6,
            " Egress Traffic ",
            NET_EGRESS_EXPR,
            {" h ": 4, " w ": 8, " x ": 8, " y ": 4},
            unit=" Bps ",
        ),
        stat_panel(
            7,
            " Intra-Cluster Traffic ",
            NET_INTERNAL_EXPR,
            {" h ": 4, " w ": 8, " x ": 16, " y ": 4},
            unit=" Bps ",
        ),
        # --- Row 3: per-node throughput ------------------------------------
        timeseries_panel(
            8,
            " Per-Node Throughput ",
            f' avg by (node) (( { NET_NODE_TX_PHYS } + { NET_NODE_RX_PHYS } ) * on(instance) group_left(node) { NODE_INFO } ) ',
            {" h ": 8, " w ": 24, " x ": 0, " y ": 8},
            unit=" Bps ",
            legend=" {{ node}} ",
            **side_table_legend,
        ),
        # --- Row 4: top talkers --------------------------------------------
        table_panel(
            9,
            " Top Namespaces ",
            ' topk(10, sum(rate(container_network_transmit_bytes_total { namespace!= " " }[5m]) '
            ' + rate(container_network_receive_bytes_total { namespace!= " " }[5m])) by (namespace)) ',
            {" h ": 9, " w ": 12, " x ": 0, " y ": 16},
            unit=" Bps ",
            transformations=[{" id ": " labelsToFields ", " options ": {}}],
        ),
        table_panel(
            10,
            " Top Pods ",
            ' topk(10, sum(rate(container_network_transmit_bytes_total { pod!= " " }[5m]) '
            ' + rate(container_network_receive_bytes_total { pod!= " " }[5m])) by (namespace,pod)) ',
            {" h ": 9, " w ": 12, " x ": 12, " y ": 16},
            unit=" Bps ",
            transformations=[{" id ": " labelsToFields ", " options ": {}}],
        ),
        # --- Row 5: Traefik request rates ----------------------------------
        timeseries_panel(
            11,
            " Traefik Routers (req/s) ",
            f" topk(10, { TRAEFIK_ROUTER_EXPR } ) ",
            {" h ": 9, " w ": 12, " x ": 0, " y ": 25},
            unit=" req/s ",
            legend=" {{ router}} ",
            **side_table_legend,
        ),
        timeseries_panel(
            12,
            " Traefik Entrypoints (req/s) ",
            ' sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m])) ',
            {" h ": 9, " w ": 12, " x ": 12, " y ": 25},
            unit=" req/s ",
            legend=" {{ entrypoint}} ",
            **side_table_legend,
        ),
    ]

    return {
        " uid ": " atlas-network ",
        " title ": " Atlas Network ",
        " folderUid ": PRIVATE_FOLDER,
        " editable ": True,
        " panels ": panels,
        " time ": {" from ": " now-12h ", " to ": " now "},
        " annotations ": {" list ": []},
        " schemaVersion ": 39,
        " style ": " dark ",
        " tags ": [" atlas ", " network "],
    }
2026-01-05 21:55:59 -03:00
def build_mail_dashboard():
    """Build the Atlas Mail dashboard definition from the postmark_* metrics.

    Returns a dict holding the dashboard metadata (uid, title, folder,
    30-day default time range, tags) and its panels, in this order:

    * ids 1-4 (y=0): sent (1d), sent (7d), bounces (1d, rate and count),
      success rate (1d);
    * ids 5-8 (y=4): sending-limit usage, sending limit, last success time,
      exporter errors;
    * ids 13-16 (y=12 and y=20): time series of bounce rate, bounced, sent
      and exporter errors.

    Panel ids 9-12 and grid rows 8-11 are currently unused.
    """

    def _absolute(*steps):
        # Builds an absolute-mode thresholds dict from (color, value) pairs.
        return {
            " mode ": " absolute ",
            " steps ": [{" color ": tone, " value ": floor} for tone, floor in steps],
        }

    bounce_rate_thresholds = _absolute(
        (" green ", None),
        (" yellow ", 5),
        (" orange ", 8),
        (" red ", 10),
    )
    limit_thresholds = _absolute(
        (" green ", None),
        (" yellow ", 70),
        (" orange ", 85),
        (" red ", 95),
    )
    success_thresholds = _absolute(
        (" red ", None),
        (" orange ", 90),
        (" yellow ", 95),
        (" green ", 98),
    )
    # Legend settings shared by the per-window time series.
    window_legend = {
        "legend": " {{ window}} ",
        "legend_display": " table ",
        "legend_placement": " right ",
    }

    # Hand-built stat panel: two targets (Rate, Count) with a per-series unit
    # override each; the bounce-rate thresholds are the panel-wide default.
    bounces_panel = {
        " id ": 3,
        " type ": " stat ",
        " title ": " Mail Bounces (1d) ",
        " datasource ": PROM_DS,
        " gridPos ": {" h ": 4, " w ": 6, " x ": 12, " y ": 0},
        " targets ": [
            {
                " expr ": ' max(postmark_outbound_bounce_rate { window= " 1d " }) ',
                " refId ": " A ",
                " legendFormat ": " Rate ",
            },
            {
                " expr ": ' max(postmark_outbound_bounced { window= " 1d " }) ',
                " refId ": " B ",
                " legendFormat ": " Count ",
            },
        ],
        " fieldConfig ": {
            " defaults ": {
                " color ": {" mode ": " thresholds "},
                " custom ": {" displayMode ": " auto "},
                " thresholds ": bounce_rate_thresholds,
                " unit ": " none ",
            },
            " overrides ": [
                {
                    " matcher ": {" id ": " byName ", " options ": " Rate "},
                    " properties ": [{" id ": " unit ", " value ": " percent "}],
                },
                {
                    " matcher ": {" id ": " byName ", " options ": " Count "},
                    " properties ": [{" id ": " unit ", " value ": " none "}],
                },
            ],
        },
        " options ": {
            " colorMode ": " value ",
            " graphMode ": " area ",
            " justifyMode ": " center ",
            " reduceOptions ": {" calcs ": [" lastNotNull "], " fields ": " ", " values ": False},
            " textMode ": " name_and_value ",
        },
    }

    panels = [
        # --- Row 1: volume and bounce tiles --------------------------------
        stat_panel(
            1,
            " Sent (1d) ",
            ' max(postmark_outbound_sent { window= " 1d " }) ',
            {" h ": 4, " w ": 6, " x ": 0, " y ": 0},
            decimals=0,
        ),
        stat_panel(
            2,
            " Sent (7d) ",
            ' max(postmark_outbound_sent { window= " 7d " }) ',
            {" h ": 4, " w ": 6, " x ": 6, " y ": 0},
            decimals=0,
        ),
        bounces_panel,
        stat_panel(
            4,
            " Success Rate (1d) ",
            ' clamp_min(100 - max(postmark_outbound_bounce_rate { window= " 1d " }), 0) ',
            {" h ": 4, " w ": 6, " x ": 18, " y ": 0},
            unit=" percent ",
            thresholds=success_thresholds,
            decimals=1,
        ),
        # --- Row 2: sending limit and exporter health ----------------------
        stat_panel(
            5,
            " Limit Used (30d) ",
            " max(postmark_sending_limit_used_percent) ",
            {" h ": 4, " w ": 6, " x ": 0, " y ": 4},
            thresholds=limit_thresholds,
            unit=" percent ",
            decimals=1,
        ),
        stat_panel(
            6,
            " Send Limit (30d) ",
            " max(postmark_sending_limit) ",
            {" h ": 4, " w ": 6, " x ": 6, " y ": 4},
            decimals=0,
        ),
        # Grafana's date/time units read the value as epoch milliseconds,
        # while this metric is expressed in seconds (per its name), so it is
        # scaled by 1000; unscaled it renders as a January-1970 date.
        # NOTE(review): assumes stat_panel passes expr and unit through
        # unchanged - confirm against the helper.
        stat_panel(
            7,
            " Last Success ",
            " max(postmark_last_success_timestamp_seconds) * 1000 ",
            {" h ": 4, " w ": 6, " x ": 12, " y ": 4},
            unit=" dateTimeAsIso ",
            decimals=0,
        ),
        stat_panel(
            8,
            " Exporter Errors ",
            " sum(postmark_request_errors_total) ",
            {" h ": 4, " w ": 6, " x ": 18, " y ": 4},
            decimals=0,
        ),
        # --- Rows 3-4: history, one series per reporting window ------------
        timeseries_panel(
            13,
            " Bounce Rate (1d vs 7d) ",
            " max by (window) (postmark_outbound_bounce_rate) ",
            {" h ": 8, " w ": 12, " x ": 0, " y ": 12},
            unit=" percent ",
            **window_legend,
        ),
        timeseries_panel(
            14,
            " Bounced (1d vs 7d) ",
            " max by (window) (postmark_outbound_bounced) ",
            {" h ": 8, " w ": 12, " x ": 12, " y ": 12},
            unit=" none ",
            **window_legend,
        ),
        timeseries_panel(
            15,
            " Sent (1d vs 7d) ",
            " max by (window) (postmark_outbound_sent) ",
            {" h ": 8, " w ": 12, " x ": 0, " y ": 20},
            unit=" none ",
            **window_legend,
        ),
        timeseries_panel(
            16,
            " Exporter Errors ",
            " sum(postmark_request_errors_total) ",
            {" h ": 8, " w ": 12, " x ": 12, " y ": 20},
            unit=" none ",
        ),
    ]

    return {
        " uid ": " atlas-mail ",
        " title ": " Atlas Mail ",
        " folderUid ": PRIVATE_FOLDER,
        " editable ": True,
        " panels ": panels,
        " time ": {" from ": " now-30d ", " to ": " now "},
        " annotations ": {" list ": []},
        " schemaVersion ": 39,
        " style ": " dark ",
        " tags ": [" atlas ", " mail "],
    }
2026-04-19 14:18:41 -03:00
def build_jobs_dashboard ( ) :
2026-04-12 20:05:39 -03:00
panels = [ ]
2026-04-19 23:22:34 -03:00
suite_var = " $ { suite:regex} "
2026-04-20 08:35:05 -03:00
test_var = " $ { test:regex} "
2026-04-21 09:35:43 -03:00
branch_var = " $ { branch:regex} "
2026-04-18 17:47:06 -03:00
success = PLATFORM_TEST_SUCCESS_STATUS
2026-04-19 14:18:41 -03:00
exported = PLATFORM_TEST_EXPORT_FILTER
runs_selector = f ' suite=~ " { suite_var } " , { exported } '
runs_success_selector = f ' { runs_selector } ,status=~ " { success } " '
runs_failure_selector = f ' { runs_selector } ,status!~ " { success } " '
checks_selector = f ' __name__=~ " .*_quality_gate_checks_total " ,suite=~ " { suite_var } " , { exported } '
coverage_metric_selector = f ' __name__=~ " .*_quality_gate_coverage_percent " ,suite=~ " { suite_var } " , { exported } '
workspace_coverage_selector = f ' suite=~ " { suite_var } " , { exported } '
smell_selector = f ' suite=~ " { suite_var } " , { exported } '
2026-05-15 21:05:13 -03:00
build_info_selector = f ' suite=~ " { suite_var } " ,branch!= " " ,branch=~ " { branch_var } " , { exported } '
2026-04-22 12:42:33 -03:00
selected_suite_universe = (
2026-05-16 02:21:05 -03:00
f ' (count by (suite) (platform_quality_gate_build_info {{ { build_info_selector } }} ) >= bool 0) '
2026-04-22 12:42:33 -03:00
)
selected_suite_zero = f " (0 * { selected_suite_universe } ) "
2026-04-19 14:18:41 -03:00
suite_universe = " or " . join (
f ' label_replace(vector(1), " suite " , " { suite } " , " __name__ " , " .* " ) '
for suite in PLATFORM_TEST_SUITE_NAMES
2026-04-18 17:47:06 -03:00
)
2026-04-19 14:18:41 -03:00
runs_24h = f ' (sum(increase(platform_quality_gate_runs_total {{ { runs_selector } }} [24h])) or on() vector(0)) '
runs_30d = f ' (sum(increase(platform_quality_gate_runs_total {{ { runs_selector } }} [30d])) or on() vector(0)) '
2026-04-18 17:47:06 -03:00
success_24h = (
2026-04-19 14:18:41 -03:00
f ' (sum(increase(platform_quality_gate_runs_total {{ { runs_success_selector } }} [24h])) or on() vector(0)) '
2026-04-18 17:47:06 -03:00
)
success_30d = (
2026-04-19 14:18:41 -03:00
f ' (sum(increase(platform_quality_gate_runs_total {{ { runs_success_selector } }} [30d])) or on() vector(0)) '
2026-04-18 17:47:06 -03:00
)
failures_24h = (
2026-04-19 14:18:41 -03:00
f ' (sum(increase(platform_quality_gate_runs_total {{ { runs_failure_selector } }} [24h])) or on() vector(0)) '
2026-04-18 17:47:06 -03:00
)
success_rate_24h = f " 100 * ( { success_24h } ) / clamp_min(( { runs_24h } ), 1) "
success_rate_30d = f " 100 * ( { success_30d } ) / clamp_min(( { runs_30d } ), 1) "
2026-04-22 14:34:40 -03:00
runs_by_suite_24h = f ' sum by (suite) (increase(platform_quality_gate_runs_total {{ { runs_selector } }} [24h])) '
success_by_suite_24h = (
f ' sum by (suite) (increase(platform_quality_gate_runs_total {{ { runs_success_selector } }} [24h])) '
)
2026-04-19 14:18:41 -03:00
success_rate_by_suite_24h = (
2026-04-22 14:34:40 -03:00
f ' sort_desc(((100 * ( { success_by_suite_24h } ) / clamp_min(( { runs_by_suite_24h } ), 1)) '
f ' and on(suite) (( { runs_by_suite_24h } ) > 0)) '
f ' or on(suite) ((0 * ( { runs_by_suite_24h } )) - 1)) '
2026-04-19 14:18:41 -03:00
)
2026-04-22 14:34:40 -03:00
non_failure = PLATFORM_TEST_NON_FAILURE_STATUS
current_gate_health_by_suite = (
f ' (100 * sum by (suite) (max by (suite, check) (( {{ { checks_selector } ,result=~ " { non_failure } " }} > bool 0))) '
f ' / clamp_min(sum by (suite) (max by (suite, check) (( {{ { checks_selector } }} > bool 0))), 1)) '
f ' or on(suite) ( { selected_suite_zero } ) '
)
2026-05-15 20:00:40 -03:00
success_history_runs = f ' sum by (suite) (increase(platform_quality_gate_runs_total {{ { runs_selector } }} [7d])) '
2026-04-20 08:35:05 -03:00
success_history_by_suite = (
2026-05-15 20:00:40 -03:00
f ' (100 * sum by (suite) (increase(platform_quality_gate_runs_total {{ { runs_success_selector } }} [7d])) '
2026-05-15 14:26:06 -03:00
f ' / ( { success_history_runs } )) and on(suite) (( { success_history_runs } ) > 0) '
2026-04-20 08:07:30 -03:00
)
2026-05-15 20:00:40 -03:00
daily_success_volume = (
f ' sum(increase(platform_quality_gate_runs_total {{ { runs_success_selector } }} [24h])) or on() vector(0) '
)
daily_failure_volume = (
f ' sum(increase(platform_quality_gate_runs_total {{ { runs_failure_selector } }} [24h])) or on() vector(0) '
)
2026-04-18 17:47:06 -03:00
coverage_by_suite = (
2026-04-19 14:18:41 -03:00
f ' (max by (suite) ( {{ { coverage_metric_selector } }} )) '
f ' or on(suite) (max by (suite) (platform_quality_gate_workspace_line_coverage_percent {{ { workspace_coverage_selector } }} )) '
2026-04-18 17:47:06 -03:00
)
2026-05-15 20:00:40 -03:00
coverage_history_by_suite = (
f ' (max by (suite) (platform_quality_gate_workspace_line_coverage_percent {{ { workspace_coverage_selector } }} )) '
f ' or on(suite) (max by (suite) ( {{ { coverage_metric_selector } }} )) '
)
2026-04-20 13:45:01 -03:00
coverage_with_missing = (
f " ( { coverage_by_suite } ) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total {{ { runs_selector } }} [30d])))) - 1) "
)
2026-04-19 14:18:41 -03:00
smell_by_suite = f ' max by (suite) (platform_quality_gate_source_lines_over_500_total {{ { smell_selector } }} ) '
2026-05-11 17:36:13 -03:00
loc_files_by_suite = f ' max by (suite) (platform_quality_gate_source_files_total {{ { smell_selector } }} ) '
2026-04-20 13:45:01 -03:00
smell_with_missing = (
f " ( { smell_by_suite } ) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total {{ { runs_selector } }} [30d])))) - 1) "
)
2026-05-11 17:36:13 -03:00
loc_limit_compliance_by_suite = (
f " (100 * clamp_min(( { loc_files_by_suite } ) - ( { smell_by_suite } ), 0) / ( { loc_files_by_suite } )) "
f " and on(suite) (( { loc_files_by_suite } ) > 0) "
)
loc_limit_compliance_with_missing = (
f " ( { loc_limit_compliance_by_suite } ) "
f " or on(suite) (100 * (1 - clamp_max(( { smell_by_suite } ), 1))) "
f " or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total {{ { runs_selector } }} [30d])))) - 1) "
)
loc_limit_compliance_history = (
2026-05-15 20:00:40 -03:00
f " ( { loc_limit_compliance_by_suite } ) "
f " or on(suite) (100 * (1 - clamp_max(( { smell_by_suite } ), 1))) "
2026-05-15 14:26:06 -03:00
)
2026-04-18 17:47:06 -03:00
average_coverage = f " (avg(( { coverage_by_suite } )) or on() vector(0)) "
suites_loc_violating = f ' (sum((( { smell_by_suite } ) > bool 0)) or on() vector(0)) '
2026-04-19 14:18:41 -03:00
2026-04-20 08:07:30 -03:00
check_regex_tests = " tests|unit|build "
check_regex_coverage = " coverage "
check_regex_loc = " loc|smell "
check_regex_style = " docs|naming|hygiene|lint|docs_naming|style "
check_regex_gate_glue = " gate|glue|gate_glue "
check_regex_sonarqube = " sonarqube|sonar "
check_regex_supply_chain = " ironbank|supply_chain|image_compliance|artifact_security "
2026-05-15 19:52:46 -03:00
def _check_state_percent_series ( regex : str , failed : bool ) - > str :
state = f ' result!~ " { non_failure } " ' if failed else f ' result=~ " { non_failure } " '
state_checks = (
f ' sum by (suite) (max by (suite, check) (( {{ { checks_selector } ,check=~ " { regex } " , { state } }} > bool 0))) '
)
total_checks = (
f ' sum by (suite) (max by (suite, check) (( {{ { checks_selector } ,check=~ " { regex } " }} > bool 0))) '
)
2026-05-15 22:07:41 -03:00
state_percent = f " (100 * ( { state_checks } ) / clamp_min(( { total_checks } ), 1)) "
2026-05-16 02:21:05 -03:00
return f " (( { state_percent } ) or on(suite) ( { selected_suite_zero } )) "
2026-04-19 14:18:41 -03:00
2026-05-15 19:52:46 -03:00
rollup_failed_tests = (
2026-05-15 21:05:13 -03:00
f ' sum by (suite, test) (platform_quality:test_case_status:count_1h {{ suite=~ " { suite_var } " ,branch!= " " ,branch=~ " { branch_var } " ,test!= " " ,test!= " __no_test_cases__ " ,status= " failed " }} ) '
2026-05-15 19:52:46 -03:00
)
raw_failed_tests = (
2026-05-15 21:05:13 -03:00
f ' sum by (suite, test) (max_over_time(platform_quality_gate_test_case_result {{ suite=~ " { suite_var } " ,branch!= " " ,branch=~ " { branch_var } " ,test!= " " ,test!= " __no_test_cases__ " , { exported } ,status= " failed " }} [$__interval])) '
2026-04-20 08:35:05 -03:00
)
2026-05-15 19:52:46 -03:00
problematic_tests_history_core = f " topk(12, (( { rollup_failed_tests } ) or on(suite, test) ( { raw_failed_tests } ))) "
2026-05-15 21:05:13 -03:00
problematic_tests_history = problematic_tests_history_core
2026-05-15 19:52:46 -03:00
rollup_failed_tests_30d = (
2026-05-15 21:05:13 -03:00
f ' sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h {{ suite=~ " { suite_var } " ,branch!= " " ,branch=~ " { branch_var } " ,test!= " " ,test!= " __no_test_cases__ " ,status= " failed " }} [30d:1h])) '
2026-05-15 19:52:46 -03:00
)
raw_failed_tests_30d = (
2026-05-15 21:05:13 -03:00
f ' sum by (suite, test) (increase(platform_quality_gate_test_case_result {{ suite=~ " { suite_var } " ,branch!= " " ,branch=~ " { branch_var } " ,test!= " " ,test!= " __no_test_cases__ " , { exported } ,status= " failed " }} [30d])) '
2026-05-15 19:52:46 -03:00
)
2026-04-20 08:37:26 -03:00
worst_test_per_suite_core = (
2026-05-15 19:52:46 -03:00
f " topk by (suite) (1, (( { rollup_failed_tests_30d } ) or on(suite, test) ( { raw_failed_tests_30d } ))) "
2026-04-20 08:35:05 -03:00
)
2026-05-15 21:05:13 -03:00
worst_test_per_suite = worst_test_per_suite_core
2026-05-15 19:52:46 -03:00
2026-05-15 21:05:13 -03:00
def _selected_status_volume ( status : str ) - > str :
return (
f ' (sum(platform_quality:test_case_status:count_1h {{ suite=~ " { suite_var } " ,branch!= " " , '
f ' branch=~ " { branch_var } " ,test!= " " ,test=~ " { test_var } " ,test!= " __no_test_cases__ " , '
f ' status= " { status } " }} ) or on() vector(0)) '
2026-05-15 19:52:46 -03:00
)
2026-04-20 08:35:05 -03:00
selected_test_pass_fail = [
{
" refId " : " A " ,
2026-05-15 21:05:13 -03:00
" expr " : _selected_status_volume ( " passed " ) ,
" legendFormat " : " Passed " ,
2026-04-20 08:35:05 -03:00
} ,
{
" refId " : " B " ,
2026-05-15 21:05:13 -03:00
" expr " : _selected_status_volume ( " failed " ) ,
" legendFormat " : " Failed " ,
2026-04-20 08:35:05 -03:00
} ,
{
" refId " : " C " ,
2026-05-15 21:05:13 -03:00
" expr " : _selected_status_volume ( " skipped " ) ,
" legendFormat " : " Skipped " ,
2026-04-20 08:35:05 -03:00
} ,
]
2026-04-22 12:42:33 -03:00
selected_test_pass_rate = (
2026-05-15 21:05:13 -03:00
f ' avg by (suite) (platform_quality:test_case_pass_rate:percent_1h {{ suite=~ " { suite_var } " , '
f ' branch!= " " ,branch=~ " { branch_var } " ,test!= " " ,test=~ " { test_var } " ,test!= " __no_test_cases__ " }} ) '
2026-04-22 12:42:33 -03:00
)
2026-04-21 09:35:43 -03:00
recent_branch_evidence = (
f ' sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info {{ { build_info_selector } }} [30d]))) '
)
non_primary_branch_evidence = (
2026-05-15 22:07:41 -03:00
f ' count by (suite) (max_over_time(platform_quality_gate_build_info {{ { build_info_selector } ,branch!~ " main|master|origin/main|origin/master|unknown " }} [30d])) '
)
branch_evidence_by_suite = (
f ' count by (suite) (max_over_time(platform_quality_gate_build_info {{ { build_info_selector } }} [30d])) '
)
primary_branch_clean_by_suite = (
f ' sort_desc((100 * ((( { branch_evidence_by_suite } ) > bool 0) '
f ' unless on(suite) (( { non_primary_branch_evidence } ) > bool 0))) '
f ' or on(suite) (0 * (( { branch_evidence_by_suite } ) > bool 0))) '
2026-04-21 09:35:43 -03:00
)
2026-04-20 08:35:05 -03:00
2026-04-21 11:46:15 -03:00
def _missing_suite_series ( presence_expr : str ) - > str :
missing = f " (( { suite_universe } ) unless on(suite) { presence_expr } ) "
return f " ( { missing } ) or on(suite) (0 * ( { suite_universe } )) "
2026-05-15 14:26:06 -03:00
def _present_suite_percent ( presence_expr : str ) - > str :
present = f " (( { suite_universe } ) and on(suite) { presence_expr } ) "
return f " (100 * ( { present } )) or on(suite) (0 * ( { suite_universe } )) "
present_tests_by_suite = _present_suite_percent (
2026-04-21 11:46:15 -03:00
f ' count by (suite) ( {{ __name__=~ " .*_quality_gate_tests_total " , { exported } }} ) '
2026-04-19 14:18:41 -03:00
)
2026-05-15 14:26:06 -03:00
present_checks_by_suite = _present_suite_percent (
2026-04-21 11:46:15 -03:00
f ' count by (suite) ( {{ __name__=~ " .*_quality_gate_checks_total " , { exported } }} ) '
2026-04-19 14:18:41 -03:00
)
2026-05-15 14:26:06 -03:00
present_coverage_by_suite = _present_suite_percent (
2026-04-21 11:46:15 -03:00
f " count by (suite) (platform_quality_gate_workspace_line_coverage_percent {{ { exported } }} ) "
2026-04-19 14:18:41 -03:00
)
2026-05-15 14:26:06 -03:00
present_loc_by_suite = _present_suite_percent (
2026-05-11 17:36:13 -03:00
f " count by (suite) (platform_quality_gate_source_lines_over_500_total {{ { exported } }} ) "
f " and on(suite) count by (suite) (platform_quality_gate_source_files_total {{ { exported } }} ) "
2026-04-18 17:47:06 -03:00
)
2026-05-15 14:26:06 -03:00
present_test_case_by_suite = _present_suite_percent (
2026-04-21 11:46:15 -03:00
f " count by (suite) (platform_quality_gate_test_case_result {{ { exported } }} ) "
2026-04-20 13:45:01 -03:00
)
2026-05-15 14:26:06 -03:00
real_test_case_by_suite = _present_suite_percent (
2026-04-22 12:42:33 -03:00
f ' count by (suite) (platform_quality_gate_test_case_result {{ { exported } ,test!= " __no_test_cases__ " }} ) '
)
2026-04-18 17:47:06 -03:00
2026-05-15 14:26:06 -03:00
dark_red = " dark-red "
dark_orange = " dark-orange "
dark_yellow = " dark-yellow "
dark_green = " dark-green "
dark_blue = " dark-blue "
2026-04-19 14:18:41 -03:00
success_thresholds = {
2026-04-12 20:05:39 -03:00
" mode " : " absolute " ,
" steps " : [
2026-05-15 14:26:06 -03:00
{ " color " : dark_red , " value " : None } ,
{ " color " : dark_orange , " value " : 90 } ,
{ " color " : dark_yellow , " value " : 93 } ,
{ " color " : dark_green , " value " : 95 } ,
{ " color " : dark_blue , " value " : 100 } ,
2026-04-12 20:05:39 -03:00
] ,
}
2026-04-22 12:42:33 -03:00
coverage_thresholds = success_thresholds
2026-04-12 20:05:39 -03:00
failures_thresholds = {
" mode " : " absolute " ,
" steps " : [
2026-05-15 22:43:44 -03:00
{ " color " : dark_blue , " value " : None } ,
{ " color " : dark_green , " value " : 0.01 } ,
2026-05-15 14:26:06 -03:00
{ " color " : dark_yellow , " value " : 1 } ,
{ " color " : dark_orange , " value " : 3 } ,
{ " color " : dark_red , " value " : 5 } ,
2026-04-18 17:47:06 -03:00
] ,
}
2026-04-12 22:58:21 -03:00
smell_thresholds = {
" mode " : " absolute " ,
" steps " : [
2026-05-15 14:26:06 -03:00
{ " color " : dark_red , " value " : None } ,
{ " color " : dark_green , " value " : 0 } ,
{ " color " : dark_yellow , " value " : 1 } ,
{ " color " : dark_orange , " value " : 3 } ,
{ " color " : dark_red , " value " : 5 } ,
2026-04-12 22:58:21 -03:00
] ,
}
2026-04-19 14:18:41 -03:00
missing_thresholds = {
" mode " : " absolute " ,
" steps " : [
2026-05-15 14:26:06 -03:00
{ " color " : dark_green , " value " : None } ,
{ " color " : dark_red , " value " : 1 } ,
2026-04-19 14:18:41 -03:00
] ,
}
2026-04-12 20:05:39 -03:00
2026-04-18 17:47:06 -03:00
panels . append (
stat_panel (
2 ,
2026-04-22 14:34:40 -03:00
" Run Reliability (24h) " ,
2026-04-18 17:47:06 -03:00
success_rate_24h ,
2026-04-19 14:18:41 -03:00
{ " h " : 5 , " w " : 4 , " x " : 0 , " y " : 0 } ,
2026-04-18 17:47:06 -03:00
unit = " percent " ,
decimals = 2 ,
instant = True ,
2026-04-19 14:18:41 -03:00
thresholds = success_thresholds ,
2026-04-18 17:47:06 -03:00
)
2026-03-31 14:51:49 -03:00
)
2026-04-18 17:47:06 -03:00
panels . append (
stat_panel (
3 ,
2026-04-22 14:34:40 -03:00
" Run Reliability (30d) " ,
2026-04-18 17:47:06 -03:00
success_rate_30d ,
2026-04-19 14:18:41 -03:00
{ " h " : 5 , " w " : 4 , " x " : 4 , " y " : 0 } ,
2026-04-18 17:47:06 -03:00
unit = " percent " ,
decimals = 2 ,
instant = True ,
2026-04-19 14:18:41 -03:00
thresholds = success_thresholds ,
2026-04-18 17:47:06 -03:00
)
2026-03-31 14:51:49 -03:00
)
2026-04-12 20:05:39 -03:00
panels . append (
2026-04-18 17:47:06 -03:00
stat_panel (
2026-04-12 20:05:39 -03:00
4 ,
2026-04-22 14:34:40 -03:00
" Failed Runs (24h) " ,
2026-04-18 17:47:06 -03:00
failures_24h ,
2026-04-19 14:18:41 -03:00
{ " h " : 5 , " w " : 4 , " x " : 8 , " y " : 0 } ,
2026-04-18 17:47:06 -03:00
unit = " none " ,
instant = True ,
thresholds = failures_thresholds ,
)
)
panels . append (
stat_panel (
5 ,
" Runs (24h) " ,
runs_24h ,
2026-04-19 14:18:41 -03:00
{ " h " : 5 , " w " : 4 , " x " : 12 , " y " : 0 } ,
2026-04-18 17:47:06 -03:00
unit = " none " ,
instant = True ,
thresholds = {
" mode " : " absolute " ,
2026-05-15 14:26:06 -03:00
" steps " : [ { " color " : dark_red , " value " : None } , { " color " : dark_green , " value " : 1 } ] ,
2026-04-18 17:47:06 -03:00
} ,
)
)
panels . append (
stat_panel (
6 ,
" Avg Coverage ( % ) " ,
average_coverage ,
2026-04-19 14:18:41 -03:00
{ " h " : 5 , " w " : 4 , " x " : 16 , " y " : 0 } ,
2026-04-18 17:47:06 -03:00
unit = " percent " ,
decimals = 2 ,
instant = True ,
2026-04-19 14:18:41 -03:00
thresholds = success_thresholds ,
2026-04-18 17:47:06 -03:00
)
)
panels . append (
stat_panel (
7 ,
" Suites with LOC >500 " ,
suites_loc_violating ,
2026-04-19 14:18:41 -03:00
{ " h " : 5 , " w " : 4 , " x " : 20 , " y " : 0 } ,
2026-04-18 17:47:06 -03:00
unit = " none " ,
instant = True ,
thresholds = smell_thresholds ,
)
)
2026-04-19 14:18:41 -03:00
2026-04-18 17:47:06 -03:00
panels . append (
bargauge_panel (
8 ,
2026-04-22 14:34:40 -03:00
" Current Gate Health by Suite " ,
current_gate_health_by_suite ,
2026-04-22 12:42:33 -03:00
{ " h " : 8 , " w " : 8 , " x " : 0 , " y " : 5 } ,
2026-04-12 20:05:39 -03:00
unit = " percent " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " asc " ,
2026-04-19 14:18:41 -03:00
thresholds = success_thresholds ,
2026-04-12 20:05:39 -03:00
decimals = 2 ,
)
2026-03-31 14:51:49 -03:00
)
2026-04-22 14:34:40 -03:00
panels [ - 1 ] [ " description " ] = (
" Current pass percentage across the required gate dimensions reported by each suite. "
" This is the fastest place to answer whether the latest suite quality signal is healthy. "
)
reliability_suite_panel = bargauge_panel (
9 ,
" Run Reliability by Suite (24h) " ,
success_rate_by_suite_24h ,
{ " h " : 8 , " w " : 8 , " x " : 8 , " y " : 5 } ,
unit = " percent " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " asc " ,
thresholds = success_thresholds ,
decimals = 2 ,
)
reliability_suite_panel [ " description " ] = (
" Rolling CI run success rate. This can stay low after failed/debug runs even when "
" Current Gate Health is green. "
)
reliability_suite_panel [ " fieldConfig " ] [ " defaults " ] [ " mappings " ] = [
{ " type " : " value " , " options " : { " -1 " : { " text " : " no runs " } } }
]
panels . append ( reliability_suite_panel )
2026-05-15 22:07:41 -03:00
history_panel = state_timeline_panel (
2026-04-18 17:47:06 -03:00
11 ,
2026-05-15 20:00:40 -03:00
" Run Reliability by Suite (7d rolling) " ,
2026-04-18 17:47:06 -03:00
success_history_by_suite ,
2026-04-22 12:42:33 -03:00
{ " h " : 8 , " w " : 24 , " x " : 0 , " y " : 13 } ,
2026-05-15 20:00:40 -03:00
thresholds = success_thresholds ,
description = (
" Seven-day rolling run success rate per suite. Each suite gets its own lane, "
" so brief failed/debug runs lower the lane color without creating unreadable 0/100 spikes. "
) ,
)
panels . append ( history_panel )
run_volume_panel = timeseries_panel (
12 ,
" Daily Run Volume (Selected Scope) " ,
None ,
{ " h " : 8 , " w " : 8 , " x " : 0 , " y " : 21 } ,
unit = " none " ,
targets = [
{ " refId " : " A " , " expr " : daily_success_volume , " legendFormat " : " Success " } ,
{ " refId " : " B " , " expr " : daily_failure_volume , " legendFormat " : " Failure " } ,
] ,
2026-04-08 23:33:17 -03:00
legend_display = " list " ,
legend_placement = " bottom " ,
2026-05-15 20:00:40 -03:00
legend_calcs = [ ] ,
)
run_volume_panel [ " description " ] = (
" Twenty-four-hour rolling run counts for the selected suite/branch scope. "
" This is volume, not a pass-rate percentage. "
)
run_volume_panel [ " fieldConfig " ] [ " defaults " ] [ " min " ] = 0
run_volume_panel [ " fieldConfig " ] [ " defaults " ] [ " custom " ] = {
" drawStyle " : " bars " ,
" barAlignment " : 0 ,
" lineWidth " : 0 ,
" fillOpacity " : 70 ,
" stacking " : { " mode " : " normal " , " group " : " A " } ,
2026-04-09 16:35:14 -03:00
}
2026-05-15 20:00:40 -03:00
panels . append ( run_volume_panel )
2026-04-19 14:18:41 -03:00
2026-04-18 17:47:06 -03:00
panels . append (
2026-05-15 22:07:41 -03:00
state_timeline_panel (
2026-04-18 17:47:06 -03:00
13 ,
2026-05-15 20:00:40 -03:00
" Coverage History by Suite " ,
coverage_history_by_suite ,
2026-04-22 12:42:33 -03:00
{ " h " : 8 , " w " : 8 , " x " : 8 , " y " : 21 } ,
2026-05-15 20:00:40 -03:00
thresholds = coverage_thresholds ,
description = (
" Latest reported line coverage per suite over time. Coverage is separate "
" from LOC compliance so one signal cannot hide the other. "
) ,
2026-04-18 17:47:06 -03:00
)
)
2026-05-15 20:00:40 -03:00
panels . append (
2026-05-15 22:07:41 -03:00
state_timeline_panel (
2026-05-15 20:00:40 -03:00
14 ,
" Files <=500 LOC History by Suite " ,
loc_limit_compliance_history ,
{ " h " : 8 , " w " : 8 , " x " : 16 , " y " : 21 } ,
thresholds = success_thresholds ,
description = (
" Percent of LOC-gated source files at or under the 500-line limit. "
" This uses the existing file-count telemetry; longest-file history needs a new publisher metric. "
) ,
)
2026-04-19 14:18:41 -03:00
)
2026-04-22 12:42:33 -03:00
check_dimensions = [
( " Tests " , check_regex_tests ) ,
( " Coverage " , check_regex_coverage ) ,
( " LOC " , check_regex_loc ) ,
( " Style " , check_regex_style ) ,
( " Gate Glue " , check_regex_gate_glue ) ,
( " SonarQube " , check_regex_sonarqube ) ,
( " Supply Chain " , check_regex_supply_chain ) ,
]
def _append_check_trends(start_id: int, title_prefix: str, failed: bool, y: int) -> None:
    """Append one state-timeline panel per entry of ``check_dimensions`` to ``panels``.

    The first four dimensions share a row of width-6 panels at ``y``; the
    remaining ones go on a second row of width-8 panels at ``y + 7``.
    Panel ids are consecutive, starting at ``start_id``.

    Args:
        start_id: Id given to the first panel of the group.
        title_prefix: Text placed after the dimension label in each title.
        failed: Selects the failure thresholds/description and is forwarded
            to ``_check_state_percent_series``; otherwise the success
            variants are used.
        y: Grid row of the first row of panels.
    """
    if failed:
        trend_thresholds = failures_thresholds
        trend_description = (
            " Current bad-state percentage for this check family, evaluated over time. "
            " Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart. "
        )
    else:
        trend_thresholds = success_thresholds
        trend_description = (
            " Current acceptable-state percentage for this check family, evaluated over time. "
            " Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence. "
        )

    first_row_size = 4
    for position, (label, regex) in enumerate(check_dimensions):
        if position < first_row_size:
            # First row: four narrow panels side by side.
            grid = {" h ": 7, " w ": 6, " x ": position * 6, " y ": y}
        else:
            # Second row: the remaining panels, wider, directly below.
            column = position - first_row_size
            grid = {" h ": 7, " w ": 8, " x ": column * 8, " y ": y + 7}
        panels.append(
            state_timeline_panel(
                start_id + position,
                f" { label } { title_prefix } ",
                _check_state_percent_series(regex, failed),
                grid,
                thresholds=trend_thresholds,
                description=trend_description,
            )
        )
2026-05-15 19:52:46 -03:00
_append_check_trends ( 130 , " Failure Rate " , True , 29 )
_append_check_trends ( 138 , " Healthy Rate " , False , 43 )
2026-04-20 08:07:30 -03:00
panels . append (
2026-05-15 22:07:41 -03:00
state_timeline_panel (
2026-04-22 12:42:33 -03:00
145 ,
" Problematic Tests Over Time (Top failures) " ,
problematic_tests_history ,
{ " h " : 8 , " w " : 12 , " x " : 0 , " y " : 57 } ,
2026-05-15 21:05:13 -03:00
thresholds = failures_thresholds ,
2026-04-20 08:07:30 -03:00
unit = " none " ,
2026-05-15 21:05:13 -03:00
min_value = 0 ,
max_value = None ,
2026-05-15 14:26:06 -03:00
legend = " {{ suite}} - {{ test}} " ,
2026-05-15 21:05:13 -03:00
description = (
" Top failing test cases over time, using memoized hourly rollups. "
" Blank branch/test labels and placeholder no-test-case rows are excluded. "
) ,
2026-04-20 08:07:30 -03:00
)
)
2026-05-15 21:05:13 -03:00
panels [ - 1 ] [ " links " ] = jenkins_suite_links ( )
panels [ - 1 ] [ " fieldConfig " ] [ " defaults " ] [ " links " ] = jenkins_latest_artifact_data_links ( )
2026-04-18 17:47:06 -03:00
panels . append (
2026-04-19 14:18:41 -03:00
bargauge_panel (
2026-04-22 12:42:33 -03:00
147 ,
" Most Problematic Test by Suite (30d) " ,
worst_test_per_suite ,
{ " h " : 8 , " w " : 12 , " x " : 12 , " y " : 57 } ,
2026-04-18 17:47:06 -03:00
unit = " none " ,
instant = True ,
2026-04-20 08:35:05 -03:00
legend = " {{ suite}} · {{ test}} " ,
2026-04-22 12:42:33 -03:00
sort_order = " desc " ,
thresholds = failures_thresholds ,
limit = 9 ,
2026-04-20 13:45:01 -03:00
links = jenkins_suite_links ( ) ,
2026-04-21 11:39:13 -03:00
data_links = jenkins_latest_artifact_data_links ( ) ,
2026-04-20 08:35:05 -03:00
)
)
panels . append (
timeseries_panel (
146 ,
" Selected Test Pass/Fail History " ,
None ,
2026-04-22 12:42:33 -03:00
{ " h " : 8 , " w " : 12 , " x " : 0 , " y " : 65 } ,
2026-04-20 08:35:05 -03:00
unit = " none " ,
targets = selected_test_pass_fail ,
legend_display = " list " ,
legend_placement = " bottom " ,
2026-05-15 14:26:06 -03:00
legend_calcs = [ ] ,
2026-04-20 13:45:01 -03:00
links = jenkins_suite_links ( ) ,
2026-04-21 11:39:13 -03:00
data_links = jenkins_artifact_data_links ( ) ,
2026-04-20 08:35:05 -03:00
)
)
2026-05-15 21:05:13 -03:00
panels [ - 1 ] [ " description " ] = (
" Stacked hourly outcome volume for the selected suite/branch/test scope. "
" This uses vmalert rollups only, avoiding expensive raw 30-day per-test scans. "
)
panels [ - 1 ] [ " fieldConfig " ] [ " defaults " ] [ " min " ] = 0
panels [ - 1 ] [ " fieldConfig " ] [ " defaults " ] [ " custom " ] = {
" drawStyle " : " bars " ,
" barAlignment " : 0 ,
" lineWidth " : 0 ,
" fillOpacity " : 70 ,
" stacking " : { " mode " : " normal " , " group " : " A " } ,
}
2026-05-15 22:07:41 -03:00
selected_pass_rate_panel = state_timeline_panel (
2026-04-22 12:42:33 -03:00
152 ,
" Selected Test Pass Rate History " ,
selected_test_pass_rate ,
{ " h " : 8 , " w " : 12 , " x " : 12 , " y " : 65 } ,
2026-05-15 21:05:13 -03:00
thresholds = success_thresholds ,
2026-05-15 14:26:06 -03:00
legend = " {{ suite}} " ,
2026-05-15 21:05:13 -03:00
description = (
" Average pass rate per suite for the selected test filter, using memoized hourly "
" test-case pass-rate rollups instead of raw historical scans. "
) ,
2026-04-20 08:35:05 -03:00
)
2026-05-15 21:05:13 -03:00
selected_pass_rate_panel [ " links " ] = jenkins_suite_links ( )
selected_pass_rate_panel [ " fieldConfig " ] [ " defaults " ] [ " links " ] = jenkins_artifact_data_links ( )
2026-04-22 12:42:33 -03:00
panels . append ( selected_pass_rate_panel )
2026-04-18 17:47:06 -03:00
2026-04-12 22:58:21 -03:00
coverage_panel = bargauge_panel (
2026-04-18 17:47:06 -03:00
17 ,
" Coverage by Suite (Latest, gate 95) " ,
coverage_with_missing ,
2026-04-22 12:42:33 -03:00
{ " h " : 8 , " w " : 12 , " x " : 0 , " y " : 73 } ,
2026-04-12 22:58:21 -03:00
unit = " percent " ,
instant = True ,
legend = " {{ suite}} " ,
sort_order = " asc " ,
2026-04-22 12:42:33 -03:00
thresholds = coverage_thresholds ,
2026-04-12 22:58:21 -03:00
decimals = 2 ,
)
coverage_panel [ " fieldConfig " ] [ " defaults " ] [ " mappings " ] = [
2026-04-18 17:47:06 -03:00
{ " type " : " value " , " options " : { " -1 " : { " text " : " missing " } } }
2026-04-12 22:58:21 -03:00
]
panels . append ( coverage_panel )
2026-04-19 14:18:41 -03:00
2026-04-12 22:58:21 -03:00
smell_panel = bargauge_panel (
2026-04-18 17:47:06 -03:00
18 ,
2026-05-11 17:36:13 -03:00
" Files <=500 LOC by Suite (Latest) " ,
loc_limit_compliance_with_missing ,
2026-04-22 12:42:33 -03:00
{ " h " : 8 , " w " : 12 , " x " : 12 , " y " : 73 } ,
2026-05-11 17:36:13 -03:00
unit = " percent " ,
2026-04-12 22:58:21 -03:00
instant = True ,
legend = " {{ suite}} " ,
2026-05-11 17:36:13 -03:00
sort_order = " asc " ,
thresholds = success_thresholds ,
decimals = 0 ,
2026-04-12 22:58:21 -03:00
)
smell_panel [ " fieldConfig " ] [ " defaults " ] [ " mappings " ] = [
2026-04-18 17:47:06 -03:00
{ " type " : " value " , " options " : { " -1 " : { " text " : " missing " } } }
2026-04-12 22:58:21 -03:00
]
2026-05-11 17:36:13 -03:00
smell_panel [ " description " ] = " Percent of managed LOC-gated files at or under 500 lines. Older suite payloads fall back to 100 % /0 % u ntil they emit platform_quality_gate_source_files_total. "
2026-04-12 22:58:21 -03:00
panels . append ( smell_panel )
2026-01-18 02:50:07 -03:00
2026-04-19 14:18:41 -03:00
panels . append (
bargauge_panel (
27 ,
2026-05-15 14:26:06 -03:00
" Tests Metrics Present by Suite " ,
present_tests_by_suite ,
2026-04-22 12:42:33 -03:00
{ " h " : 7 , " w " : 6 , " x " : 0 , " y " : 81 } ,
2026-05-15 14:26:06 -03:00
unit = " percent " ,
2026-04-19 14:18:41 -03:00
instant = True ,
legend = " {{ suite}} " ,
sort_order = " desc " ,
2026-05-15 14:26:06 -03:00
thresholds = success_thresholds ,
2026-04-19 14:18:41 -03:00
decimals = 0 ,
)
)
panels . append (
bargauge_panel (
28 ,
2026-05-15 14:26:06 -03:00
" Checks Metrics Present by Suite " ,
present_checks_by_suite ,
2026-04-22 12:42:33 -03:00
{ " h " : 7 , " w " : 6 , " x " : 6 , " y " : 81 } ,
2026-05-15 14:26:06 -03:00
unit = " percent " ,
2026-04-19 14:18:41 -03:00
instant = True ,
legend = " {{ suite}} " ,
sort_order = " desc " ,
2026-05-15 14:26:06 -03:00
thresholds = success_thresholds ,
2026-04-19 14:18:41 -03:00
decimals = 0 ,
)
)
panels . append (
bargauge_panel (
29 ,
2026-05-15 14:26:06 -03:00
" Coverage Metrics Present by Suite " ,
present_coverage_by_suite ,
2026-04-22 12:42:33 -03:00
{ " h " : 7 , " w " : 6 , " x " : 12 , " y " : 81 } ,
2026-05-15 14:26:06 -03:00
unit = " percent " ,
2026-04-19 14:18:41 -03:00
instant = True ,
legend = " {{ suite}} " ,
sort_order = " desc " ,
2026-05-15 14:26:06 -03:00
thresholds = success_thresholds ,
2026-04-19 14:18:41 -03:00
decimals = 0 ,
)
)
panels . append (
bargauge_panel (
30 ,
2026-05-15 14:26:06 -03:00
" LOC Compliance Metrics Present by Suite " ,
present_loc_by_suite ,
2026-04-22 12:42:33 -03:00
{ " h " : 7 , " w " : 6 , " x " : 18 , " y " : 81 } ,
2026-05-15 14:26:06 -03:00
unit = " percent " ,
2026-04-19 14:18:41 -03:00
instant = True ,
legend = " {{ suite}} " ,
sort_order = " desc " ,
2026-05-15 14:26:06 -03:00
thresholds = success_thresholds ,
2026-04-19 14:18:41 -03:00
decimals = 0 ,
)
)
panels . append (
stat_panel (
31 ,
" SonarQube API Up " ,
" (max(sonarqube_up) or on() vector(0)) " ,
2026-04-22 12:42:33 -03:00
{ " h " : 6 , " w " : 4 , " x " : 0 , " y " : 88 } ,
2026-04-19 14:18:41 -03:00
unit = " none " ,
instant = True ,
thresholds = {
" mode " : " absolute " ,
" steps " : [
2026-05-15 14:26:06 -03:00
{ " color " : dark_red , " value " : None } ,
{ " color " : dark_green , " value " : 1 } ,
2026-04-19 14:18:41 -03:00
] ,
} ,
)
)
panels . append (
stat_panel (
32 ,
" Sonar Projects (Selected) " ,
2026-05-15 21:05:13 -03:00
f ' (count(max by (project_key) (sonarqube_project_quality_gate_pass {{ project_key=~ " { suite_var } " }} )) or on() vector(0)) ' ,
2026-04-22 12:42:33 -03:00
{ " h " : 6 , " w " : 4 , " x " : 4 , " y " : 88 } ,
2026-04-19 14:18:41 -03:00
unit = " none " ,
instant = True ,
thresholds = failures_thresholds ,
)
)
panels . append (
stat_panel (
33 ,
" Sonar Gate Fetch Errors " ,
" (max(sonarqube_quality_gate_fetch_errors_total) or on() vector(0)) " ,
2026-04-22 12:42:33 -03:00
{ " h " : 6 , " w " : 4 , " x " : 8 , " y " : 88 } ,
2026-04-19 14:18:41 -03:00
unit = " none " ,
instant = True ,
thresholds = failures_thresholds ,
)
)
sonar_status_mix_panel = pie_panel (
34 ,
" Sonar Gate Status Mix (Selected) " ,
2026-05-15 21:05:13 -03:00
f ' count by (status) (max by (project_key, status) (sonarqube_project_quality_gate_pass {{ project_key=~ " { suite_var } " }} )) ' ,
{ " h " : 6 , " w " : 4 , " x " : 12 , " y " : 88 } ,
2026-04-19 14:18:41 -03:00
)
sonar_status_mix_panel [ " targets " ] [ 0 ] [ " legendFormat " ] = " {{ status}} "
panels . append ( sonar_status_mix_panel )
panels . append (
2026-05-15 22:07:41 -03:00
state_timeline_panel (
2026-04-19 14:18:41 -03:00
35 ,
2026-05-15 22:07:41 -03:00
" Sonar Gate Health by Project " ,
f ' 100 * max by (project_key) (sonarqube_project_quality_gate_pass {{ project_key=~ " { suite_var } " }} ) ' ,
2026-05-15 21:05:13 -03:00
{ " h " : 6 , " w " : 8 , " x " : 16 , " y " : 88 } ,
2026-05-15 22:07:41 -03:00
thresholds = success_thresholds ,
unit = " percent " ,
2026-05-15 21:05:13 -03:00
min_value = 0 ,
2026-05-15 22:07:41 -03:00
max_value = 100 ,
2026-04-19 14:18:41 -03:00
legend = " {{ project_key}} " ,
2026-05-15 21:05:13 -03:00
description = (
2026-05-15 22:07:41 -03:00
" SonarQube gate status over time by project. OK projects render as full healthy lanes; "
" non-OK projects drop to red without disappearing. "
2026-05-15 21:05:13 -03:00
) ,
2026-04-19 14:18:41 -03:00
)
)
2026-04-20 13:45:01 -03:00
panels . append (
bargauge_panel (
148 ,
2026-05-15 14:26:06 -03:00
" Test-Case Metrics Present by Suite " ,
present_test_case_by_suite ,
2026-04-22 12:42:33 -03:00
{ " h " : 6 , " w " : 12 , " x " : 0 , " y " : 94 } ,
2026-05-15 14:26:06 -03:00
unit = " percent " ,
2026-04-22 12:42:33 -03:00
instant = True ,
legend = " {{ suite}} " ,
sort_order = " desc " ,
2026-05-15 14:26:06 -03:00
thresholds = success_thresholds ,
2026-04-22 12:42:33 -03:00
decimals = 0 ,
)
)
panels . append (
bargauge_panel (
151 ,
2026-05-15 14:26:06 -03:00
" Real Test Cases Present by Suite " ,
real_test_case_by_suite ,
2026-04-22 12:42:33 -03:00
{ " h " : 6 , " w " : 12 , " x " : 12 , " y " : 94 } ,
2026-05-15 14:26:06 -03:00
unit = " percent " ,
2026-04-20 13:45:01 -03:00
instant = True ,
legend = " {{ suite}} " ,
sort_order = " desc " ,
2026-05-15 14:26:06 -03:00
thresholds = success_thresholds ,
2026-04-20 13:45:01 -03:00
decimals = 0 ,
)
)
2026-04-21 09:35:43 -03:00
panels . append (
bargauge_panel (
149 ,
" Recent Branch Evidence by Suite (30d) " ,
recent_branch_evidence ,
2026-04-22 12:42:33 -03:00
{ " h " : 7 , " w " : 12 , " x " : 0 , " y " : 100 } ,
2026-04-21 09:35:43 -03:00
unit = " none " ,
instant = True ,
legend = " {{ suite}} · {{ branch}} " ,
sort_order = " desc " ,
thresholds = missing_thresholds ,
decimals = 0 ,
links = jenkins_suite_links ( ) ,
)
)
panels . append (
bargauge_panel (
150 ,
2026-05-15 22:07:41 -03:00
" Primary Branch Clean by Suite (30d) " ,
primary_branch_clean_by_suite ,
2026-04-22 12:42:33 -03:00
{ " h " : 7 , " w " : 12 , " x " : 12 , " y " : 100 } ,
2026-05-15 22:07:41 -03:00
unit = " percent " ,
2026-04-21 09:35:43 -03:00
instant = True ,
2026-05-15 22:07:41 -03:00
legend = " {{ suite}} " ,
2026-04-21 09:35:43 -03:00
sort_order = " desc " ,
2026-05-15 22:07:41 -03:00
thresholds = success_thresholds ,
2026-04-21 09:35:43 -03:00
decimals = 0 ,
links = jenkins_suite_links ( ) ,
)
)
2026-04-19 14:18:41 -03:00
2026-04-22 16:56:52 -03:00
# Keep the first paint intentionally light. The detailed matrices remain
# available, but they stay collapsed so browsers do not render every series
# and legend before the operator asks for them.
panel_by_id = { panel [ " id " ] : panel for panel in panels }
visible_layout = {
2 : { " h " : 4 , " w " : 4 , " x " : 0 , " y " : 0 } ,
3 : { " h " : 4 , " w " : 4 , " x " : 4 , " y " : 0 } ,
4 : { " h " : 4 , " w " : 4 , " x " : 8 , " y " : 0 } ,
5 : { " h " : 4 , " w " : 4 , " x " : 12 , " y " : 0 } ,
6 : { " h " : 4 , " w " : 4 , " x " : 16 , " y " : 0 } ,
7 : { " h " : 4 , " w " : 4 , " x " : 20 , " y " : 0 } ,
2026-05-15 14:26:06 -03:00
8 : { " h " : 7 , " w " : 12 , " x " : 0 , " y " : 4 } ,
9 : { " h " : 7 , " w " : 12 , " x " : 12 , " y " : 4 } ,
2026-04-22 16:56:52 -03:00
17 : { " h " : 7 , " w " : 12 , " x " : 0 , " y " : 11 } ,
18 : { " h " : 7 , " w " : 12 , " x " : 12 , " y " : 11 } ,
}
compact_panels = [ ]
for panel_id , grid in visible_layout . items ( ) :
panel = panel_by_id [ panel_id ]
panel [ " gridPos " ] = grid
compact_panels . append ( panel )
def children(ids):
    """Return the panels stored in ``panel_by_id`` for ``ids``, in the given order.

    Raises ``KeyError`` if an id has no panel in ``panel_by_id``.
    """
    return [panel_by_id[wanted_id] for wanted_id in ids]
2026-05-15 14:26:06 -03:00
row_layout = {
11 : { " h " : 8 , " w " : 12 , " x " : 0 , " y " : 19 } ,
12 : { " h " : 8 , " w " : 12 , " x " : 12 , " y " : 19 } ,
13 : { " h " : 8 , " w " : 12 , " x " : 0 , " y " : 27 } ,
14 : { " h " : 8 , " w " : 12 , " x " : 12 , " y " : 27 } ,
2026-05-15 22:07:41 -03:00
145 : { " h " : 8 , " w " : 24 , " x " : 0 , " y " : 74 } ,
147 : { " h " : 8 , " w " : 8 , " x " : 0 , " y " : 83 } ,
146 : { " h " : 8 , " w " : 8 , " x " : 8 , " y " : 83 } ,
152 : { " h " : 8 , " w " : 8 , " x " : 16 , " y " : 83 } ,
2026-05-15 19:52:46 -03:00
27 : { " h " : 7 , " w " : 6 , " x " : 0 , " y " : 94 } ,
28 : { " h " : 7 , " w " : 6 , " x " : 6 , " y " : 94 } ,
29 : { " h " : 7 , " w " : 6 , " x " : 12 , " y " : 94 } ,
30 : { " h " : 7 , " w " : 6 , " x " : 18 , " y " : 94 } ,
148 : { " h " : 7 , " w " : 6 , " x " : 0 , " y " : 101 } ,
151 : { " h " : 7 , " w " : 6 , " x " : 6 , " y " : 101 } ,
149 : { " h " : 7 , " w " : 6 , " x " : 12 , " y " : 101 } ,
150 : { " h " : 7 , " w " : 6 , " x " : 18 , " y " : 101 } ,
2026-05-15 14:26:06 -03:00
31 : { " h " : 6 , " w " : 4 , " x " : 0 , " y " : 111 } ,
32 : { " h " : 6 , " w " : 4 , " x " : 4 , " y " : 111 } ,
33 : { " h " : 6 , " w " : 4 , " x " : 8 , " y " : 111 } ,
2026-05-15 21:05:13 -03:00
34 : { " h " : 6 , " w " : 4 , " x " : 12 , " y " : 111 } ,
35 : { " h " : 6 , " w " : 8 , " x " : 16 , " y " : 111 } ,
2026-05-15 14:26:06 -03:00
}
for panel_id , grid in row_layout . items ( ) :
panel_by_id [ panel_id ] [ " gridPos " ] = grid
2026-04-22 16:56:52 -03:00
compact_panels . extend (
[
row_panel ( 500 , " Reliability And Run History " , 18 , panels = children ( [ 11 , 12 , 13 , 14 ] ) ) ,
row_panel (
501 ,
2026-05-15 19:52:46 -03:00
" Check Failure Rates By Suite " ,
2026-04-22 16:56:52 -03:00
19 ,
panels = children ( [ 130 , 131 , 132 , 133 , 134 , 135 , 136 ] ) ,
) ,
row_panel (
502 ,
2026-05-15 19:52:46 -03:00
" Check Healthy Rates By Suite " ,
2026-04-22 16:56:52 -03:00
20 ,
panels = children ( [ 138 , 139 , 140 , 141 , 142 , 143 , 144 ] ) ,
) ,
row_panel (
503 ,
" Test Drilldowns And Problem Tests " ,
21 ,
panels = children ( [ 145 , 147 , 146 , 152 ] ) ,
) ,
row_panel (
504 ,
2026-05-15 14:26:06 -03:00
" Telemetry Completeness And Branches " ,
2026-04-22 16:56:52 -03:00
22 ,
2026-05-15 14:26:06 -03:00
panels = children ( [ 27 , 28 , 29 , 30 , 148 , 151 , 149 , 150 ] ) ,
) ,
row_panel (
505 ,
" SonarQube Project Health " ,
23 ,
panels = children ( [ 31 , 32 , 33 , 34 , 35 ] ) ,
2026-04-22 16:56:52 -03:00
) ,
]
)
panels = compact_panels
2026-05-11 01:01:46 -03:00
set_bargauge_display_mode ( panels , " basic " )
2026-04-22 16:56:52 -03:00
2026-01-18 02:50:07 -03:00
return {
2026-04-19 14:18:41 -03:00
" uid " : " atlas-jobs " ,
2026-04-12 20:05:39 -03:00
" title " : " Atlas Testing " ,
2026-01-18 02:50:07 -03:00
" folderUid " : PRIVATE_FOLDER ,
" editable " : True ,
" panels " : panels ,
2026-04-12 20:05:39 -03:00
" time " : { " from " : " now-30d " , " to " : " now " } ,
2026-01-18 02:50:07 -03:00
" annotations " : { " list " : [ ] } ,
" schemaVersion " : 39 ,
" style " : " dark " ,
2026-04-19 14:18:41 -03:00
" tags " : [ " atlas " , " testing " , " quality-gate " , " ci " ] ,
" templating " : {
" list " : [
testing_suite_variable ( ) ,
2026-04-21 09:35:43 -03:00
testing_branch_variable ( ) ,
2026-04-21 11:39:13 -03:00
testing_case_variable ( ) ,
2026-04-20 13:45:01 -03:00
jenkins_base_variable ( ) ,
2026-04-19 14:18:41 -03:00
]
} ,
2026-01-18 02:50:07 -03:00
}
2026-04-22 02:26:31 -03:00
def build_testing_dashboard():
    """Return the jobs dashboard re-published as a non-editable copy.

    The dict produced by ``build_jobs_dashboard()`` is reused as-is, except
    that it gets its own uid, is filed under ``PUBLIC_DASHBOARD_FOLDER`` and
    is marked as not editable.
    """
    testing_dashboard = build_jobs_dashboard()
    testing_dashboard.update(
        {
            " uid ": " atlas-testing ",
            " folderUid ": PUBLIC_DASHBOARD_FOLDER,
            " editable ": False,
        }
    )
    return testing_dashboard
2026-05-15 19:37:03 -03:00
def build_gitops_dashboard():
    """Assemble the "Atlas GitOps" dashboard definition.

    Layout, top to bottom:
      * a row of five stat panels (Flux source, Kustomization and
        HelmRelease ready percentage / suspended count),
      * an exporter stat panel next to a readiness state timeline,
      * three full-width tables (Flux sources, Kustomizations, HelmReleases).

    Returns:
        The dashboard as a plain dict (uid, title, folder, panels, default
        12h time range, tags).
    """

    def _grid(h, w, x, y):
        """Return a gridPos dict."""
        return {" h ": h, " w ": w, " x ": x, " y ": y}

    def _absolute_thresholds(*steps):
        """Return a new absolute-mode thresholds dict from (color, value) pairs."""
        return {
            " mode ": " absolute ",
            " steps ": [{" color ": color, " value ": value} for color, value in steps],
        }

    def _ready_flag_thresholds():
        """Red below 1, blue from 1 upwards."""
        return _absolute_thresholds((" red ", None), (" blue ", 1))

    def _ready_percent_thresholds():
        """Red by default, yellow from 99, blue at 100."""
        return _absolute_thresholds((" red ", None), (" yellow ", 99), (" blue ", 100))

    def _suspended_thresholds():
        """Blue by default, red as soon as the value reaches 1."""
        return _absolute_thresholds((" blue ", None), (" red ", 1))

    # Shared by all three tables: colours the "Value" column by readiness.
    gitops_value_overrides = [
        {
            " matcher ": {" id ": " byName ", " options ": " Value "},
            " properties ": [
                {" id ": " thresholds ", " value ": _ready_flag_thresholds()},
            ],
        }
    ]

    # Each table query multiplies the *_info series by the matching *_ready
    # series, joined on (namespace, name).
    ready_join = f" * on(namespace, name) group_left() max by (namespace, name) "
    kustomization_table = (
        f" max by (namespace, name, path, source_namespace, source_name, revision, ready, reason) "
        + f" (ananke_gitops_kustomization_info {{ { GITOPS_SELECTOR } }} ) "
        + ready_join
        + f" (ananke_gitops_kustomization_ready {{ { GITOPS_SELECTOR } }} ) "
    )
    helm_table = (
        f" max by (namespace, name, chart, version, app_version, revision, ready, reason) "
        + f" (ananke_gitops_helmrelease_info {{ { GITOPS_SELECTOR } }} ) "
        + ready_join
        + f" (ananke_gitops_helmrelease_ready {{ { GITOPS_SELECTOR } }} ) "
    )
    source_table = (
        f" max by (namespace, name, url, branch, revision, ready, reason) "
        + f" (ananke_gitops_flux_source_info {{ { GITOPS_SELECTOR } }} ) "
        + ready_join
        + f" (ananke_gitops_flux_source_ready {{ { GITOPS_SELECTOR } }} ) "
    )

    flux_source_expr = f" { GITOPS_SOURCE_INFO } or on() vector(0) "
    flux_source_panel = stat_panel(
        1,
        " Flux Source ",
        flux_source_expr,
        _grid(4, 8, 0, 0),
        text_mode=" name ",
        targets=[
            {
                " expr ": flux_source_expr,
                " refId ": " A ",
                " legendFormat ": " {{ branch}} · {{ revision}} ",
                " instant ": True,
            }
        ],
        thresholds=_ready_flag_thresholds(),
        description=" Branch and revision currently reported by Flux ' s GitRepository source. ",
    )
    kustomizations_ready_panel = stat_panel(
        2,
        " Kustomizations Ready ",
        GITOPS_KUSTOMIZATION_READY_PCT,
        _grid(4, 4, 8, 0),
        unit=" percent ",
        decimals=1,
        thresholds=_ready_percent_thresholds(),
    )
    kustomizations_suspended_panel = stat_panel(
        3,
        " Kustomizations Suspended ",
        GITOPS_KUSTOMIZATION_SUSPENDED,
        _grid(4, 4, 12, 0),
        thresholds=_suspended_thresholds(),
    )
    helm_ready_panel = stat_panel(
        4,
        " HelmReleases Ready ",
        GITOPS_HELM_READY_PCT,
        _grid(4, 4, 16, 0),
        unit=" percent ",
        decimals=1,
        thresholds=_ready_percent_thresholds(),
    )
    helm_suspended_panel = stat_panel(
        5,
        " HelmReleases Suspended ",
        GITOPS_HELM_SUSPENDED,
        _grid(4, 4, 20, 0),
        thresholds=_suspended_thresholds(),
    )
    exporter_panel = stat_panel(
        6,
        " GitOps Exporter ",
        None,
        _grid(4, 8, 0, 4),
        text_mode=" name_and_value ",
        targets=[
            {
                " expr ": GITOPS_SCRAPE_SUCCESS,
                " refId ": " A ",
                " legendFormat ": " Scrape Success ",
                " instant ": True,
            },
            {
                " expr ": GITOPS_LAST_SCRAPE_AGE,
                " refId ": " B ",
                " legendFormat ": " Sample Age ",
                " instant ": True,
            },
        ],
        field_overrides=[
            {
                " matcher ": {" id ": " byName ", " options ": " Sample Age "},
                " properties ": [{" id ": " unit ", " value ": " s "}],
            },
            {
                " matcher ": {" id ": " byName ", " options ": " Scrape Success "},
                " properties ": [
                    {" id ": " thresholds ", " value ": _ready_flag_thresholds()},
                ],
            },
        ],
        thresholds=_ready_flag_thresholds(),
    )
    readiness_history_panel = state_timeline_panel(
        7,
        " Readiness History ",
        (
            f' label_replace( { GITOPS_KUSTOMIZATION_READY_PCT } , " kind " , " Kustomizations " , " __name__ " , " .* " ) '
            f' or label_replace( { GITOPS_HELM_READY_PCT } , " kind " , " HelmReleases " , " __name__ " , " .* " ) '
        ),
        _grid(4, 16, 8, 4),
        thresholds=_ready_percent_thresholds(),
        legend=" {{ kind}} ",
        description=" Ready percentage over time for Flux Kustomizations and HelmReleases. ",
    )

    panels = [
        flux_source_panel,
        kustomizations_ready_panel,
        kustomizations_suspended_panel,
        helm_ready_panel,
        helm_suspended_panel,
        exporter_panel,
        readiness_history_panel,
    ]

    # The three inventory tables differ only in id, title, query, position
    # and description.
    table_specs = (
        (
            8,
            " Flux Sources ",
            source_table,
            _grid(8, 24, 0, 8),
            " A Value of 1 means Ready; 0 means not Ready. ",
        ),
        (
            9,
            " Kustomizations ",
            kustomization_table,
            _grid(12, 24, 0, 16),
            " A Value of 1 means Ready; 0 means not Ready. The ready/reason labels come from Flux status.conditions. ",
        ),
        (
            10,
            " HelmReleases ",
            helm_table,
            _grid(12, 24, 0, 28),
            " A Value of 1 means Ready; 0 means not Ready. Chart/version/app_version are included when Flux reports them. ",
        ),
    )
    for table_id, table_title, table_expr, table_grid, table_note in table_specs:
        panels.append(
            table_panel(
                table_id,
                table_title,
                table_expr,
                table_grid,
                instant=True,
                format=" table ",
                transformations=[{" id ": " labelsToFields ", " options ": {}}],
                field_overrides=gitops_value_overrides,
                description=table_note,
            )
        )

    return {
        " uid ": " atlas-gitops ",
        " title ": " Atlas GitOps ",
        " folderUid ": PRIVATE_FOLDER,
        " editable ": True,
        " panels ": panels,
        " time ": {" from ": " now-12h ", " to ": " now "},
        " annotations ": {" list ": []},
        " schemaVersion ": 39,
        " style ": " dark ",
        " tags ": [" atlas ", " gitops ", " flux "],
    }
2026-04-03 14:55:16 -03:00
def build_power_dashboard():
    """Assemble the "Atlas Power" dashboard definition.

    Three rows of two half-width panels each; the left panel is a stat
    snapshot and the right panel the matching time series:
      * UPS load / UPS power-draw history,
      * current climate / climate history,
      * fan activity / fan intensity history.

    Returns:
        The dashboard as a plain dict (uid, title, folder, panels, default
        24h time range, tags).
    """

    def _grid(x, y):
        """Return the gridPos of a half-width, 8-high panel at (x, y)."""
        return {" h ": 8, " w ": 12, " x ": x, " y ": y}

    def _target(ref_id, expr, legend, *, instant=False):
        """Return a query target; the instant key is only added when requested."""
        target = {" refId ": ref_id, " expr ": expr, " legendFormat ": legend}
        if instant:
            target[" instant "] = True
        return target

    def _prop(prop_id, value):
        """Return one field-override property."""
        return {" id ": prop_id, " value ": value}

    def _override(field_name, *properties):
        """Return a by-name field override carrying ``properties`` in order."""
        return {
            " matcher ": {" id ": " byName ", " options ": field_name},
            " properties ": list(properties),
        }

    # Value mapping for the on-battery flag; shared by both UPS status fields.
    status_mapping = [
        {
            " type ": " value ",
            " options ": {
                " 0 ": {" text ": " ⚡ Charging "},
                " 1 ": {" text ": " 🔋 Discharging "},
            },
        }
    ]

    # Series names double as legend entries and as override matchers.
    db_draw = f" { ANANKE_UPS_DB_NAME } Draw (W) "
    db_discharge = f" { ANANKE_UPS_DB_NAME } Discharge "
    db_status = f" { ANANKE_UPS_DB_NAME } Status "
    tethys_draw = f" { ANANKE_UPS_TETHYS_NAME } Draw (W) "
    tethys_discharge = f" { ANANKE_UPS_TETHYS_NAME } Discharge "
    tethys_status = f" { ANANKE_UPS_TETHYS_NAME } Status "
    db_node_note = f" Attached node: { ANANKE_UPS_DB_NODE } "
    tethys_node_note = f" Attached node: { ANANKE_UPS_TETHYS_NODE } "

    ups_snapshot = stat_panel(
        1,
        " UPS Current Load ",
        None,
        _grid(0, 0),
        unit=" none ",
        decimals=1,
        text_mode=" name_and_value ",
        targets=[
            _target(" A ", ANANKE_UPS_DRAW_WATTS_DB, db_draw, instant=True),
            _target(" B ", ANANKE_UPS_RUNTIME_DB, db_discharge, instant=True),
            _target(" C ", ANANKE_UPS_ON_BATTERY_DB, db_status, instant=True),
            _target(" D ", ANANKE_UPS_DRAW_WATTS_TETHYS, tethys_draw, instant=True),
            _target(" E ", ANANKE_UPS_RUNTIME_TETHYS, tethys_discharge, instant=True),
            _target(" F ", ANANKE_UPS_ON_BATTERY_TETHYS, tethys_status, instant=True),
        ],
        field_overrides=[
            _override(db_draw, _prop(" unit ", " watt "), _prop(" description ", db_node_note)),
            _override(tethys_draw, _prop(" unit ", " watt "), _prop(" description ", tethys_node_note)),
            _override(db_discharge, _prop(" unit ", " s "), _prop(" description ", db_node_note)),
            _override(tethys_discharge, _prop(" unit ", " s "), _prop(" description ", tethys_node_note)),
            _override(db_status, _prop(" mappings ", status_mapping), _prop(" description ", db_node_note)),
            _override(tethys_status, _prop(" mappings ", status_mapping), _prop(" description ", tethys_node_note)),
        ],
        orientation=" horizontal ",
        wide_layout=True,
        description=(
            " Per-UPS live snapshot: current draw in watts, estimated battery runtime if discharge started now, and charging/discharging status. "
        ),
    )

    ups_history = apply_bar_timeseries_style(
        timeseries_panel(
            2,
            " UPS History (Power Draw) ",
            None,
            _grid(12, 0),
            unit=" watt ",
            targets=[
                _target(" A ", ANANKE_UPS_DRAW_WATTS_DB_SERIES, ANANKE_UPS_DB_NAME),
                _target(" B ", ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES, ANANKE_UPS_TETHYS_NAME),
            ],
            field_overrides=fixed_color_overrides(
                {ANANKE_UPS_DB_NAME: " dark-blue ", ANANKE_UPS_TETHYS_NAME: " dark-yellow "}
            ),
            legend_display=" table ",
            legend_placement=" right ",
            description=" Historical UPS power consumption in watts for titan-db and tethys. ",
        ),
        stacked=False,
    )

    climate_snapshot = stat_panel(
        3,
        " Current Climate ",
        None,
        _grid(0, 8),
        unit=" none ",
        decimals=2,
        text_mode=" name_and_value ",
        targets=[
            _target(" A ", CLIMATE_TEMP_MAX, " Tent Temp (°C) ", instant=True),
            _target(" B ", CLIMATE_PRESSURE_CURRENT, " Tent VPD (kPa) ", instant=True),
            _target(" C ", CLIMATE_HUMIDITY_MAX, " Tent RH ( % ) ", instant=True),
            _target(" D ", CLIMATE_DEWPOINT_CURRENT, " Dew Point (°C) ", instant=True),
        ],
        field_overrides=[
            _override(" Tent Temp (°C) ", _prop(" unit ", " celsius ")),
            _override(" Tent VPD (kPa) ", _prop(" unit ", " suffix:kPa ")),
            _override(" Tent RH ( % ) ", _prop(" unit ", " percent ")),
            _override(" Dew Point (°C) ", _prop(" unit ", " celsius ")),
        ],
        orientation=" horizontal ",
        wide_layout=True,
        description=" Current tent temperature, humidity, VPD, and dew point. These render once Typhon climate telemetry is online. ",
    )

    climate_history = timeseries_panel(
        4,
        " Climate History ",
        None,
        _grid(12, 8),
        unit=" celsius ",
        targets=[
            _target(" A ", CLIMATE_TEMP_SERIES, " Temperature (°C) "),
            _target(" B ", CLIMATE_HUMIDITY_SERIES, " Humidity ( % ) "),
            _target(" C ", CLIMATE_PRESSURE_SERIES, " VPD (kPa) "),
            _target(" D ", CLIMATE_DEWPOINT_SERIES, " Dew Point (°C) "),
        ],
        field_overrides=[
            _override(" Humidity ( % ) ", _prop(" unit ", " percent ")),
            # VPD is the only series moved to the right-hand axis.
            _override(
                " VPD (kPa) ",
                _prop(" unit ", " none "),
                _prop(" custom.axisPlacement ", " right "),
                _prop(" custom.axisLabel ", " kPa "),
                _prop(" decimals ", 2),
            ),
        ],
        legend_display=" table ",
        legend_placement=" right ",
        description=" Two-axis chart: tent temperature/humidity/dew point (left axis) and tent VPD in kPa (right axis). ",
    )

    fan_snapshot = stat_panel(
        5,
        " Fan Activity ",
        None,
        _grid(0, 16),
        unit=" none ",
        decimals=0,
        text_mode=" name_and_value ",
        targets=[
            _target(" A ", f" round( { CLIMATE_FAN_OUTLET_CURRENT } ) ", " Inside Outlet ", instant=True),
            _target(" B ", f" round( { CLIMATE_FAN_INSIDE_INLET_CURRENT } ) ", " Inside Inlet ", instant=True),
            _target(" C ", f" round( { CLIMATE_FAN_OUTSIDE_INLET_CURRENT } ) ", " Outside Inlet ", instant=True),
            _target(" D ", f" round( { CLIMATE_FAN_INTERIOR_CURRENT } ) ", " Interior Fans ", instant=True),
        ],
        thresholds={
            " mode ": " absolute ",
            " steps ": [
                {" color ": " green ", " value ": None},
                {" color ": " yellow ", " value ": 7},
                {" color ": " red ", " value ": 9},
            ],
        },
        orientation=" horizontal ",
        wide_layout=True,
        description=" Current fan activity levels (0-10): inside outlet, inside inlet, outside inlet, and interior fans. ",
    )

    fan_history = timeseries_panel(
        6,
        " Fan Intensity History ",
        None,
        _grid(12, 16),
        unit=" none ",
        max_value=10,
        targets=[
            _target(" A ", CLIMATE_FAN_OUTLET_SERIES, " Inside Outlet "),
            _target(" B ", CLIMATE_FAN_INSIDE_INLET_SERIES, " Inside Inlet "),
            _target(" C ", CLIMATE_FAN_OUTSIDE_INLET_SERIES, " Outside Inlet "),
            _target(" D ", CLIMATE_FAN_INTERIOR_SERIES, " Interior Fans "),
        ],
        legend_display=" table ",
        legend_placement=" right ",
        description=" Historical fan activity for all four fan groups (0-10 scale). ",
    )

    panels = [
        ups_snapshot,
        ups_history,
        climate_snapshot,
        climate_history,
        fan_snapshot,
        fan_history,
    ]
    return {
        " uid ": " atlas-power ",
        " title ": " Atlas Power ",
        " folderUid ": PRIVATE_FOLDER,
        " editable ": True,
        " panels ": panels,
        " time ": {" from ": " now-24h ", " to ": " now "},
        " annotations ": {" list ": []},
        " schemaVersion ": 39,
        " style ": " dark ",
        " tags ": [" atlas ", " power ", " climate "],
    }
2025-12-02 13:16:00 -03:00
def build_gpu_dashboard ( ) :
# Build the Atlas GPU dashboard definition and return it as a plain dict.
#
# Panels, in order (each occupies a 12-wide by 8-high grid cell):
#   1. pie_panel        - expression from namespace_gpu_share_expr(gpu_scope)
#   2. timeseries_panel - expression from namespace_gpu_usage_instant(gpu_scope)
#   3. timeseries_panel - expression from gpu_util_by_hostname()
#   4. table_panel      - inline topk(10, ...) query over DCGM_FI_DEV_GPU_UTIL
#
# Returns: a dict carrying the uid, title, folderUid (PRIVATE_FOLDER), the
# panel list, a default time range, tags, and the templating variables.
panels = [ ]
2026-01-01 14:44:33 -03:00
# Variable reference handed to the namespace-scoped expression helpers; the
# name matches the namespace_scope_gpu variable declared under templating below.
gpu_scope = " $namespace_scope_gpu "
2025-12-02 13:16:00 -03:00
# Panel 1 (top-left cell): per-namespace share, with scope-switching links.
panels . append (
pie_panel (
1 ,
2025-12-02 14:41:39 -03:00
" Namespace GPU Share " ,
2026-01-01 14:44:33 -03:00
namespace_gpu_share_expr ( gpu_scope ) ,
2025-12-02 13:16:00 -03:00
{ " h " : 8 , " w " : 12 , " x " : 0 , " y " : 0 } ,
2026-01-01 14:44:33 -03:00
links = namespace_scope_links ( " namespace_scope_gpu " ) ,
2026-01-18 02:50:07 -03:00
description = " Shares are normalized within the selected filter. Switching scope changes the denominator. " ,
2025-12-02 13:16:00 -03:00
)
)
# Panel 2 (top-right cell): utilisation over time, one series per namespace label.
panels . append (
timeseries_panel (
2 ,
2025-12-02 14:41:39 -03:00
" GPU Util by Namespace " ,
2026-01-01 14:44:33 -03:00
namespace_gpu_usage_instant ( gpu_scope ) ,
2025-12-02 13:16:00 -03:00
{ " h " : 8 , " w " : 12 , " x " : 12 , " y " : 0 } ,
unit = " percent " ,
legend = " {{ namespace}} " ,
legend_display = " table " ,
legend_placement = " right " ,
)
)
# Panel 3 (bottom-left cell): utilisation over time, one series per Hostname label.
panels . append (
timeseries_panel (
3 ,
2025-12-02 14:41:39 -03:00
" GPU Util by Node " ,
2026-01-27 21:43:37 -03:00
gpu_util_by_hostname ( ) ,
2025-12-02 13:16:00 -03:00
{ " h " : 8 , " w " : 12 , " x " : 0 , " y " : 8 } ,
unit = " percent " ,
legend = " {{ Hostname}} " ,
legend_display = " table " ,
legend_placement = " right " ,
)
)
# Panel 4 (bottom-right cell): the ten highest pods, summed by namespace, pod and Hostname.
panels . append (
table_panel (
4 ,
2025-12-02 14:41:39 -03:00
" Top Pods by GPU Util " ,
2025-12-02 13:16:00 -03:00
' topk(10, sum(DCGM_FI_DEV_GPU_UTIL { pod!= " " }) by (namespace,pod,Hostname)) ' ,
{ " h " : 8 , " w " : 12 , " x " : 12 , " y " : 8 } ,
unit = " percent " ,
# NOTE(review): presumably this turns the grouped labels into table columns - confirm against the Grafana transformation docs.
transformations = [ { " id " : " labelsToFields " , " options " : { } } ] ,
)
)
return {
" uid " : " atlas-gpu " ,
" title " : " Atlas GPU " ,
" folderUid " : PRIVATE_FOLDER ,
" editable " : True ,
" panels " : panels ,
" time " : { " from " : " now-12h " , " to " : " now " } ,
" annotations " : { " list " : [ ] } ,
" schemaVersion " : 39 ,
" style " : " dark " ,
" tags " : [ " atlas " , " gpu " ] ,
2026-01-01 14:44:33 -03:00
# NOTE(review): only namespace_scope_gpu is referenced by the panels in this
# function; the CPU and RAM scope variables are declared as well - presumably
# needed by namespace_scope_links(); verify before removing.
" templating " : {
" list " : [
namespace_scope_variable ( " namespace_scope_cpu " , " CPU namespace filter " ) ,
namespace_scope_variable ( " namespace_scope_gpu " , " GPU namespace filter " ) ,
namespace_scope_variable ( " namespace_scope_ram " , " RAM namespace filter " ) ,
]
} ,
2025-12-02 13:16:00 -03:00
}
2025-11-17 14:22:46 -03:00
# Registry of every generated dashboard, keyed by dashboard uid.
# Each entry holds:
#   builder   - function called with no arguments to produce the dashboard dict
#   configmap - Path of the ConfigMap YAML file written by render_configmap()
# main() walks this mapping for both the optional build pass and the render pass.
DASHBOARDS = {
" atlas-overview " : {
" builder " : build_overview ,
" configmap " : ROOT / " services " / " monitoring " / " grafana-dashboard-overview.yaml " ,
} ,
" atlas-pods " : {
" builder " : build_pods_dashboard ,
" configmap " : ROOT / " services " / " monitoring " / " grafana-dashboard-pods.yaml " ,
} ,
" atlas-nodes " : {
" builder " : build_nodes_dashboard ,
" configmap " : ROOT / " services " / " monitoring " / " grafana-dashboard-nodes.yaml " ,
} ,
" atlas-storage " : {
" builder " : build_storage_dashboard ,
" configmap " : ROOT / " services " / " monitoring " / " grafana-dashboard-storage.yaml " ,
} ,
2025-11-17 16:27:38 -03:00
" atlas-network " : {
" builder " : build_network_dashboard ,
" configmap " : ROOT / " services " / " monitoring " / " grafana-dashboard-network.yaml " ,
} ,
2026-01-05 21:55:59 -03:00
" atlas-mail " : {
" builder " : build_mail_dashboard ,
" configmap " : ROOT / " services " / " monitoring " / " grafana-dashboard-mail.yaml " ,
} ,
2026-04-22 02:26:31 -03:00
" atlas-testing " : {
" builder " : build_testing_dashboard ,
" configmap " : ROOT / " services " / " monitoring " / " grafana-dashboard-testing.yaml " ,
} ,
2026-05-15 19:37:03 -03:00
" atlas-gitops " : {
" builder " : build_gitops_dashboard ,
" configmap " : ROOT / " services " / " monitoring " / " grafana-dashboard-gitops.yaml " ,
} ,
2026-04-03 14:55:16 -03:00
" atlas-power " : {
" builder " : build_power_dashboard ,
" configmap " : ROOT / " services " / " monitoring " / " grafana-dashboard-power.yaml " ,
} ,
2025-12-02 13:16:00 -03:00
" atlas-gpu " : {
" builder " : build_gpu_dashboard ,
" configmap " : ROOT / " services " / " monitoring " / " grafana-dashboard-gpu.yaml " ,
} ,
2025-11-17 14:22:46 -03:00
}
2025-11-17 16:27:38 -03:00
def write_json ( uid , data ) :
# Serialize one dashboard definition to a JSON file under DASHBOARD_DIR.
#
# uid:  dashboard identifier; the output file is named after it with a .json suffix.
# data: dashboard dict to serialize (must be JSON-serializable).
# Returns None. Overwrites any existing file of the same name.
2025-11-17 14:22:46 -03:00
# Create the output directory (and any missing parents) on first use.
DASHBOARD_DIR . mkdir ( parents = True , exist_ok = True )
path = DASHBOARD_DIR / f " { uid } .json "
2026-05-15 19:52:46 -03:00
# Normalize status colors to the shared palette before anything is written.
data = apply_global_status_palette ( data )
2025-11-17 14:22:46 -03:00
# Pretty-printed with a two-space indent, followed by a newline literal.
path . write_text ( json . dumps ( data , indent = 2 ) + " \n " )
2025-11-17 16:27:38 -03:00
def render_configmap ( uid , info ) :
# Render the stored JSON of one dashboard into its ConfigMap YAML file.
#
# uid:  dashboard identifier; selects the JSON file under DASHBOARD_DIR.
# info: DASHBOARDS entry; only its configmap path is read here.
# Returns None. Writes the YAML file and prints a one-line summary.
# The JSON is read from disk, not rebuilt, so json_path.read_text() raises
# if the file has never been generated.
2025-11-17 14:22:46 -03:00
json_path = DASHBOARD_DIR / f " { uid } .json "
2026-05-15 19:52:46 -03:00
# The palette is applied again on load, so the rendered output is normalized
# even when the JSON on disk was not regenerated in this run.
payload = json . dumps ( apply_global_status_palette ( json . loads ( json_path . read_text ( ) ) ) , indent = 2 )
2025-11-17 14:22:46 -03:00
# Prefix every line so the JSON nests under the block scalar that
# CONFIG_TEMPLATE opens after the data key.
indented = " \n " . join ( " " + line for line in payload . splitlines ( ) )
2025-11-17 16:27:38 -03:00
output_path = info [ " configmap " ]
2025-11-17 14:22:46 -03:00
content = CONFIG_TEMPLATE . format (
# Header comment line of the YAML: path relative to the repository ROOT.
relative_path = output_path . relative_to ( ROOT ) ,
# ConfigMap metadata name: the YAML file name without its extension.
name = output_path . stem ,
# Data key: the JSON file name.
key = json_path . name ,
payload = indented ,
)
output_path . write_text ( content )
print ( f " Rendered { json_path . name } -> { output_path . relative_to ( ROOT ) } " )
def main ( ) :
# Command-line entry point.
#
# With the build flag: first regenerate every dashboard JSON from its builder.
# Always: re-render every ConfigMap from the JSON files on disk.
# The module docstring is reused as the argparse description.
parser = argparse . ArgumentParser ( description = __doc__ )
parser . add_argument ( " --build " , action = " store_true " , help = " Regenerate dashboard JSON files from builders " )
args = parser . parse_args ( )
if args . build :
# Build pass: call each registered builder and write its result as JSON.
for uid , info in DASHBOARDS . items ( ) :
write_json ( uid , info [ " builder " ] ( ) )
# Render pass: runs regardless of the flag and consumes the JSON files.
for uid , info in DASHBOARDS . items ( ) :
render_configmap ( uid , info )
# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == " __main__ " :
main ( )