#!/usr/bin/env python3 """Generate Atlas Grafana dashboards and render them into ConfigMaps. Usage: scripts/dashboards_render_atlas.py --build # rebuild JSON + ConfigMaps scripts/dashboards_render_atlas.py # re-render ConfigMaps from JSON """ import argparse import json import textwrap import urllib.parse from pathlib import Path # --------------------------------------------------------------------------- # Paths, folders, and shared metadata # --------------------------------------------------------------------------- ROOT = Path(__file__).resolve().parents[1] DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards" CONFIG_TEMPLATE = textwrap.dedent( """# {relative_path} apiVersion: v1 kind: ConfigMap metadata: name: {name} labels: grafana_dashboard: "1" data: {key}: | {payload} """ ) PROM_DS = {"type": "prometheus", "uid": "atlas-vm"} PUBLIC_FOLDER = "overview" PUBLIC_DASHBOARD_FOLDER = "atlas-public" PRIVATE_FOLDER = "atlas-internal" ASTRAIOS_MOUNTPOINT = "/mnt/astraios" GLOBAL_STATUS_COLOR_TONES = { "blue": "dark-blue", "green": "dark-green", "yellow": "dark-yellow", "orange": "dark-orange", "red": "dark-red", } COLOR_VALUE_KEYS = {"color", "fixedColor"} def apply_global_status_palette(value, parent_key=None): """Normalize generated Grafana status colors to the shared Atlas tones.""" if isinstance(value, dict): return {key: apply_global_status_palette(item, key) for key, item in value.items()} if isinstance(value, list): return [apply_global_status_palette(item, parent_key) for item in value] if parent_key in COLOR_VALUE_KEYS and isinstance(value, str): return GLOBAL_STATUS_COLOR_TONES.get(value, value) return value PERCENT_THRESHOLDS = { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 50}, {"color": "orange", "value": 75}, {"color": "red", "value": 91.5}, ], } NAMESPACE_CPU_WINDOW = "1m" # --------------------------------------------------------------------------- # Cluster metadata # 
# ---------------------------------------------------------------------------
# Control-plane and worker node inventories.  NOTE: list order matters for
# NODE_TIEBREAKER below, which weights nodes by their position here.
CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
CONTROL_DEPENDENCIES = ["titan-db", "titan-jh"]
CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES
WORKER_NODES = [
    "titan-04",
    "titan-05",
    "titan-06",
    "titan-07",
    "titan-08",
    "titan-09",
    "titan-10",
    "titan-11",
    "titan-20",
    "titan-21",
    "titan-12",
    "titan-13",
    "titan-14",
    "titan-15",
    "titan-16",
    "titan-17",
    "titan-18",
    "titan-19",
    "titan-22",
    "titan-24",
]
# Regex alternations and totals derived from the inventories above.
CONTROL_REGEX = "|".join(CONTROL_PLANE_NODES)
CONTROL_ALL_REGEX = "|".join(CONTROL_ALL)
WORKER_REGEX = "|".join(WORKER_NODES)
CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
WORKER_TOTAL = len(WORKER_NODES)
# "/N" suffixes rendered next to ready counts on stat panels.
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}"

# Namespaces considered infrastructure (excluded from workload counts)
INFRA_PATTERNS = [
    "kube-.*",
    ".*-system",
    "traefik",
    "monitoring",
    "logging",
    "cert-manager",
    "maintenance",
    "postgres",
]
INFRA_REGEX = f"^({'|'.join(INFRA_PATTERNS)})$"
# Namespaces allowed on control plane without counting as workloads
CP_ALLOWED_NS = INFRA_REGEX
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
# Gauge panel grid widths used when laying out gauge rows (sums to 24).
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
# Count of non-infra pods scheduled onto control-plane nodes (0 when none).
CONTROL_WORKLOADS_EXPR = (
    f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
)

# ---------------------------------------------------------------------------
# PromQL helpers
# ---------------------------------------------------------------------------
# node_exporter series are keyed by instance; this label_replace copies the
# nodename label to "node" so node-level joins work.
NODE_INFO = 'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")'


def node_filter(regex):
    """Return a selector that evaluates to 1 for nodes matching the regex."""
    return (
        f'label_replace(node_uname_info{{nodename=~"{regex}"}}, '
        '"node", "$1", "nodename", "(.*)")'
    )


def scoped_node_expr(base, scope=""):
    """Attach nodename metadata and optionally filter to a scope regex."""
    expr = f"avg by (node) (({base}) * on(instance) group_left(node) {NODE_INFO})"
    if scope:
        # Multiplying by the (value-1) node filter keeps only in-scope nodes.
        expr = f"({expr}) * on(node) group_left() {node_filter(scope)}"
    return expr


def node_cpu_expr(scope=""):
    """Per-node CPU busy percent (100 * (1 - idle rate)) over 5m."""
    idle = 'avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))'
    base = f"(1 - {idle}) * 100"
    return scoped_node_expr(base, scope)


def node_mem_expr(scope=""):
    """Per-node memory usage percent based on MemTotal/MemAvailable."""
    usage = (
        "avg by (instance) ("
        "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) "
        "/ node_memory_MemTotal_bytes * 100)"
    )
    return scoped_node_expr(usage, scope)


def filesystem_usage_expr(mount, scope=""):
    """Per-node usage percent of the filesystem mounted at *mount*."""
    base = (
        f'avg by (instance) ('
        f'(1 - (node_filesystem_avail_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}} '
        f'/ node_filesystem_size_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}})) * 100)'
    )
    return scoped_node_expr(base, scope)


def root_usage_expr(scope=""):
    """Per-node root filesystem usage percent."""
    return filesystem_usage_expr("/", scope)


def astraios_usage_expr(scope=""):
    """Per-node usage percent of the shared Astraios mount."""
    return filesystem_usage_expr(ASTRAIOS_MOUNTPOINT, scope)


def astreae_usage_expr(mount):
    """Cluster-wide usage percent of *mount*, aggregated across all nodes."""
    return (
        f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / "
        f"sum(node_filesystem_size_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) * 100)"
    )


def astreae_free_expr(mount):
    """Total free bytes for *mount* summed across all nodes."""
    return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})"


def topk_with_node(expr):
    """Take the top-1 series and surface its node label as the series name."""
    return f'label_replace(topk(1, {expr}), "__name__", "$1", "node", "(.*)")'


def node_net_expr(scope=""):
    """Per-node total network throughput (rx+tx bytes/s, excluding lo)."""
    base = (
        'sum by (instance) ('
        'rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
        '+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))'
    )
    return scoped_node_expr(base, scope)


def node_io_expr(scope=""):
    """Per-node disk throughput (read+write bytes/s) over 5m."""
    base = (
        "sum by (instance) (rate(node_disk_read_bytes_total[5m]) "
        "+ rate(node_disk_written_bytes_total[5m]))"
    )
    return scoped_node_expr(base, scope)


def namespace_selector(scope_var):
    """cAdvisor selector for real containers (skips the pause "POD" container)."""
    return f'namespace!="",pod!="",container!="",container!="POD",{scope_var}'


def namespace_gpu_selector(scope_var):
    """Looser pod-level selector used by GPU queries."""
    return f'namespace!="",pod!="",{scope_var}'


def namespace_cpu_raw(scope_var):
    """Per-namespace CPU usage (cores) over NAMESPACE_CPU_WINDOW."""
    return (
        "sum(rate(container_cpu_usage_seconds_total"
        f"{{{namespace_selector(scope_var)}}}[{NAMESPACE_CPU_WINDOW}])) by (namespace)"
    )


def namespace_ram_raw(scope_var):
    """Per-namespace working-set memory in bytes."""
    return f"sum(container_memory_working_set_bytes{{{namespace_selector(scope_var)}}}) by (namespace)"


def namespace_gpu_usage_instant(scope_var):
    """Alias for gpu_usage_by_namespace, kept for call-site symmetry."""
    return gpu_usage_by_namespace(scope_var)


def jetson_gpu_util_by_node():
    """GPU utilization on Jetson nodes (GR3D frequency percent)."""
    return 'max by (node) (jetson_gr3d_freq_percent{node!=""})'


def dcgm_gpu_util_by_node():
    """GPU utilization from DCGM, joined to the node via kube_pod_info.

    DCGM exposes the node name as "Hostname"; it is copied into a pod label
    and a fixed namespace label is injected so the join against
    kube_pod_info{namespace="monitoring"} can resolve the node.
    """
    dcgm_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")'
    dcgm_ns = 'label_replace(' + dcgm_pod + ', "namespace", "monitoring", "", "")'
    return (
        "avg by (node) ("
        f"{dcgm_ns} * on(namespace,pod) group_left(node) "
        'kube_pod_info{namespace="monitoring"}'
        ")"
    )


def gpu_util_by_node():
    """Per-node GPU utilization: DCGM (discrete) with Jetson fallback."""
    return f"{dcgm_gpu_util_by_node()} or {jetson_gpu_util_by_node()}"


def gpu_util_by_hostname():
    """Same as gpu_util_by_node but re-exposed under the Hostname label."""
    return 'label_replace(' + gpu_util_by_node() + ', "Hostname", "$1", "node", "(.*)")'


# Matches both label-sanitized and raw forms of the nvidia.com/gpu resource.
GPU_RESOURCE_REGEX = "nvidia(_com_|[.]com/)gpu.*"


def gpu_node_labels():
    """1 for every node that advertises allocatable NVIDIA GPUs."""
    return f'max by (node) (kube_node_status_allocatable{{resource=~"{GPU_RESOURCE_REGEX}"}} > bool 0)'


def gpu_requests_by_namespace_node(scope_var):
    """GPU resource requests grouped by (namespace, node), on GPU nodes only."""
    return (
        "sum by (namespace,node) ("
        f'kube_pod_container_resource_requests{{resource=~"{GPU_RESOURCE_REGEX}",{scope_var}}} '
        "* on(namespace,pod) group_left(node) kube_pod_info "
        f"* on(node) group_left() ({gpu_node_labels()})"
        ")"
    )


def gpu_usage_by_namespace(scope_var):
    """Apportion each node's GPU utilization to namespaces by request share."""
    requests_by_ns = gpu_requests_by_namespace_node(scope_var)
    total_by_node = f"sum by (node) ({requests_by_ns})"
    return (
        "sum by (namespace) ("
        # clamp_min avoids division by zero when a node has no GPU requests.
        f"({requests_by_ns}) / on(node) group_left() clamp_min({total_by_node}, 1) "
        f"* on(node) group_left() ({gpu_util_by_node()})"
        ")"
    )


def jetson_gpu_usage_by_namespace(scope_var):
    """Like gpu_usage_by_namespace but restricted to Jetson utilization."""
    requests_by_ns = gpu_requests_by_namespace_node(scope_var)
    total_by_node = f"sum by (node) ({requests_by_ns})"
    return (
        "sum by (namespace) ("
        f"({requests_by_ns}) / on(node) group_left() clamp_min({total_by_node}, 1) "
        f"* on(node) group_left() {jetson_gpu_util_by_node()}"
        ")"
    )


def namespace_share_expr(resource_expr):
    """Turn a per-namespace resource expr into a percent-of-total share."""
    total = f"clamp_min(sum( {resource_expr} ), 1)"
    return f"100 * ( {resource_expr} ) / {total}"


def namespace_cpu_share_expr(scope_var):
    """Per-namespace share of cluster CPU usage (percent)."""
    return namespace_share_expr(namespace_cpu_raw(scope_var))


def namespace_ram_share_expr(scope_var):
    """Per-namespace share of cluster RAM usage (percent)."""
    return namespace_share_expr(namespace_ram_raw(scope_var))


def namespace_gpu_share_expr(scope_var):
    """Per-namespace GPU share; shows a synthetic 100% "idle" row when no
    namespace is using any GPU."""
    usage = namespace_gpu_usage_instant(scope_var)
    total = f"(sum({usage}) or on() vector(0))"
    share = f"100 * ({usage}) / clamp_min({total}, 1)"
    idle = 'label_replace(vector(100), "namespace", "idle", "", "") * scalar(' + total + " == bool 0)"
    return f"({share}) or ({idle})"


# Counts of unhealthy pods; "or on() vector(0)" keeps panels at 0 when empty.
PROBLEM_PODS_EXPR = (
    'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"})) '
    "or on() vector(0)"
)
CRASHLOOP_EXPR = (
    'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
    '{reason=~"CrashLoopBackOff|ImagePullBackOff"})) '
    "or on() vector(0)"
)
# Pods whose deletion timestamp is set and older than 10 minutes.
STUCK_TERMINATING_EXPR = (
    'sum(max by (namespace,pod) ('
    '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)'
    ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
    ')) '
    "or on() vector(0)"
)
UPTIME_WINDOW = "365d"
# vmalert precomputes the expensive long-window rollup so Grafana only reads one compact series.
# Recording rule emitted by vmalert; last_over_time smooths scrape gaps.
UPTIME_RECORDING_METRIC = f'atlas:availability:ratio_{UPTIME_WINDOW}{{scope="atlas"}}'
UPTIME_RECORDING_EXPR = f"last_over_time({UPTIME_RECORDING_METRIC}[24h])"
# Fraction of desired Traefik replicas that are available (0..1).
TRAEFIK_READY_EXPR = (
    "("
    'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
    " / clamp_min("
    'sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)'
    ")"
)
# Fraction of control-plane nodes reporting Ready (0..1).
CONTROL_READY_FRACTION_EXPR = (
    f"(sum(kube_node_status_condition{{condition=\"Ready\",status=\"true\",node=~\"{CONTROL_REGEX}\"}})"
    f" / {CONTROL_TOTAL})"
)
# Instantaneous availability = worse of control-plane readiness and ingress readiness.
UPTIME_AVAIL_EXPR = (
    f"min(({CONTROL_READY_FRACTION_EXPR}), ({TRAEFIK_READY_EXPR}))"
)
# Tie-breaker to deterministically pick one node per namespace when shares tie.
NODE_TIEBREAKER = " + ".join(
    f"({node_filter(node)}) * 1e-6 * {idx}"
    for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
)
UPTIME_AVG_EXPR = UPTIME_RECORDING_EXPR
UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
# "Number of nines": clamp keeps log10 finite when availability is exactly 1.
UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
# Thresholds for the nines stat (>=3.5 nines is green).
UPTIME_THRESHOLDS = {
    "mode": "absolute",
    "steps": [
        {"color": "red", "value": None},
        {"color": "orange", "value": 2},
        {"color": "yellow", "value": 3},
        {"color": "green", "value": 3.5},
    ],
}
# Thresholds for the raw availability ratio (0..1 scale).
UPTIME_PERCENT_THRESHOLDS = {
    "mode": "absolute",
    "steps": [
        {"color": "red", "value": None},
        {"color": "orange", "value": 0.99},
        {"color": "yellow", "value": 0.999},
        {"color": "green", "value": 0.9999},
        {"color": "blue", "value": 0.99999},
    ],
}
# Table rows: pod age (seconds) with node and phase/reason labels joined on.
PROBLEM_TABLE_EXPR = (
    "(time() - kube_pod_created{pod!=\"\"}) "
    "* on(namespace,pod) group_left(node) kube_pod_info "
    "* on(namespace,pod) group_left(phase) "
    "max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})"
)
CRASHLOOP_TABLE_EXPR = (
    "(time() - kube_pod_created{pod!=\"\"}) "
    "* on(namespace,pod) group_left(node) kube_pod_info "
    "* on(namespace,pod,container) group_left(reason) "
    "max by (namespace,pod,container,reason) "
    "(kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})"
)
# Seconds each terminating pod has been stuck, with node joined on.
STUCK_TABLE_EXPR = (
    "("
    "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) "
    "and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) "
    "* on(namespace,pod) group_left(node) kube_pod_info"
    ")"
)
# Namespace scopes plugged into the selectors above via dashboard variables.
NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"'
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]


def promql_task_regex(tasks):
    """Return a PromQL-safe regex alternation for the provided task names."""
    return "|".join(tasks)


# Every Ariadne scheduled task exported to Prometheus.
ARIADNE_ALL_SCHEDULE_TASKS = [
    "schedule.mailu_sync",
    "schedule.nextcloud_sync",
    "schedule.nextcloud_cron",
    "schedule.nextcloud_maintenance",
    "schedule.vaultwarden_sync",
    "schedule.wger_user_sync",
    "schedule.wger_admin",
    "schedule.firefly_user_sync",
    "schedule.firefly_cron",
    "schedule.vault_k8s_auth",
    "schedule.vault_oidc",
    "schedule.comms_guest_name",
    "schedule.comms_pin_invite",
    "schedule.comms_reset_room",
    "schedule.comms_seed_room",
    "schedule.pod_cleaner",
    "schedule.opensearch_prune",
    "schedule.image_sweeper",
    "schedule.metis_k3s_token_sync",
    "schedule.platform_quality_suite_probe",
]
# "Fast" tasks exclude the two slow comms room jobs.
ARIADNE_FAST_SCHEDULE_TASKS = [
    task
    for task in ARIADNE_ALL_SCHEDULE_TASKS
    if task not in {"schedule.comms_pin_invite", "schedule.comms_reset_room"}
]
# Subset of tasks whose success-age feeds the schedule health panels.
ARIADNE_SCHEDULE_HEALTH_TASKS = [
    "schedule.nextcloud_sync",
    "schedule.nextcloud_cron",
    "schedule.vaultwarden_sync",
    "schedule.wger_user_sync",
    "schedule.firefly_user_sync",
    "schedule.comms_guest_name",
    "schedule.comms_seed_room",
    "schedule.pod_cleaner",
    "schedule.image_sweeper",
    "schedule.metis_k3s_token_sync",
    "schedule.platform_quality_suite_probe",
]
ARIADNE_ALL_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_ALL_SCHEDULE_TASKS)})$"'
ARIADNE_FAST_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_FAST_SCHEDULE_TASKS)})$"'
ARIADNE_SCHEDULE_HEALTH_FILTER = f'task=~"^({promql_task_regex(ARIADNE_SCHEDULE_HEALTH_TASKS)})$"'
# Raw Ariadne schedule gauges, filtered to the task sets defined above.
ARIADNE_ALL_SCHEDULE_NEXT_RUN = f"ariadne_schedule_next_run_timestamp_seconds{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
ARIADNE_ALL_SCHEDULE_LAST_SUCCESS = (
    f"ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
)
ARIADNE_ALL_SCHEDULE_LAST_ERROR = f"ariadne_schedule_last_error_timestamp_seconds{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
ARIADNE_ALL_SCHEDULE_LAST_STATUS = f"ariadne_schedule_last_status{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
ARIADNE_FAST_SCHEDULE_LAST_SUCCESS = (
    f"ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_FAST_SCHEDULE_FILTER}}}"
)
ARIADNE_FAST_SCHEDULE_LAST_ERROR = f"ariadne_schedule_last_error_timestamp_seconds{{{ARIADNE_FAST_SCHEDULE_FILTER}}}"
ARIADNE_FAST_SCHEDULE_LAST_STATUS = f"ariadne_schedule_last_status{{{ARIADNE_FAST_SCHEDULE_FILTER}}}"
ARIADNE_HEALTH_SCHEDULE_LAST_SUCCESS = (
    f"ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_SCHEDULE_HEALTH_FILTER}}}"
)
ARIADNE_HEALTH_SCHEDULE_LAST_STATUS = f"ariadne_schedule_last_status{{{ARIADNE_SCHEDULE_HEALTH_FILTER}}}"
# Ages (seconds / hours) since last success and last error.
ARIADNE_SCHEDULE_LAST_SUCCESS_AGE = f"(time() - {ARIADNE_HEALTH_SCHEDULE_LAST_SUCCESS})"
ARIADNE_SCHEDULE_LAST_ERROR_AGE = f"(time() - {ARIADNE_ALL_SCHEDULE_LAST_ERROR})"
ARIADNE_SCHEDULE_LAST_SUCCESS_AGE_HOURS = f"({ARIADNE_SCHEDULE_LAST_SUCCESS_AGE}) / 3600"
ARIADNE_SCHEDULE_LAST_ERROR_AGE_HOURS = f"({ARIADNE_SCHEDULE_LAST_ERROR_AGE}) / 3600"
# A schedule counts as stale after 36h without a success.
ARIADNE_SCHEDULE_STALE_WINDOW_SEC = 36 * 3600
ARIADNE_SCHEDULE_STALE = f"(({ARIADNE_SCHEDULE_LAST_SUCCESS_AGE}) > bool {ARIADNE_SCHEDULE_STALE_WINDOW_SEC})"
# "Missing" = scheduled to run but has never recorded a success.
ARIADNE_SCHEDULE_MISSING = (
    f"({ARIADNE_ALL_SCHEDULE_NEXT_RUN} unless on(task) {ARIADNE_HEALTH_SCHEDULE_LAST_SUCCESS})"
)
ARIADNE_SCHEDULE_FAILED = f"((1 - {ARIADNE_HEALTH_SCHEDULE_LAST_STATUS}) > bool 0)"
ARIADNE_SCHEDULE_STALE_COUNT = f"sum({ARIADNE_SCHEDULE_STALE}) or on() vector(0)"
ARIADNE_SCHEDULE_MISSING_COUNT = f"count({ARIADNE_SCHEDULE_MISSING}) or on() vector(0)"
ARIADNE_SCHEDULE_FAILED_COUNT = f"sum({ARIADNE_SCHEDULE_FAILED}) or on() vector(0)"
# Task run counters over various windows ($__range/$__interval are Grafana vars).
ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))'
ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))'
ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[5m]))'
ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[5m]))'
# Anything neither ok nor error is surfaced as a "warning" status.
ARIADNE_TASK_WARNINGS_SERIES = (
    'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)'
)
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = f"(time() - {ARIADNE_ALL_SCHEDULE_LAST_SUCCESS}) / 3600"
ARIADNE_SCHEDULE_LAST_ERROR_HOURS = f"(time() - {ARIADNE_ALL_SCHEDULE_LAST_ERROR}) / 3600"
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
    f"(time() - max_over_time({ARIADNE_ALL_SCHEDULE_LAST_SUCCESS}[$__range])) / 3600"
)
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
    f"(time() - max_over_time({ARIADNE_ALL_SCHEDULE_LAST_ERROR}[$__range])) / 3600"
)
ARIADNE_FAST_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
    f"(time() - max_over_time({ARIADNE_FAST_SCHEDULE_LAST_SUCCESS}[$__range])) / 3600"
)
ARIADNE_FAST_SCHEDULE_NEXT_RUN_HOURS = f"(({ARIADNE_ALL_SCHEDULE_NEXT_RUN} - time()) / 3600)"
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
# ---------------------------------------------------------------------------
# Platform quality-gate (CI test suite) expressions
# ---------------------------------------------------------------------------
PLATFORM_TEST_SUITE_NAMES = [
    "ariadne",
    "metis",
    "ananke",
    "atlasbot",
    "pegasus",
    "soteria",
    "titan_iac",
    "bstein_home",
    "data_prepper",
]
PLATFORM_TEST_SUCCESS_STATUS = "ok|passed|success"
PLATFORM_TEST_NON_FAILURE_STATUS = f"{PLATFORM_TEST_SUCCESS_STATUS}|not_applicable|skipped|na|n/a"
PLATFORM_TEST_CI_JOB = "platform-quality-ci"
PLATFORM_TEST_EXPORT_FILTER = f'exported_job="{PLATFORM_TEST_CI_JOB}"'
# Canonical suite name -> regex matching all label spellings seen in metrics.
PLATFORM_TEST_SUITE_VALUE_BY_NAME = {
    "ariadne": "ariadne",
    "metis": "metis",
    "ananke": "ananke",
    "atlasbot": "atlasbot",
    "pegasus": "pegasus|pegasus-health|pegasus_health",
    "soteria": "soteria",
    "titan_iac": "titan_iac|titan-iac",
    "bstein_home": "bstein_home|bstein-home",
    "data_prepper": "data_prepper|data-prepper",
}
# Canonical suite name -> Jenkins job name (used for panel links).
PLATFORM_TEST_JENKINS_JOB_BY_SUITE = {
    "ariadne": "ariadne",
    "metis": "metis",
    "ananke": "ananke",
    "atlasbot": "atlasbot",
    "pegasus": "pegasus",
    "soteria": "Soteria",
    "titan_iac": "titan-iac",
    "bstein_home": "bstein-dev-home",
    "data_prepper": "data-prepper",
}
JENKINS_UI_BASE_DEFAULT = "https://ci.bstein.dev"
PLATFORM_TEST_SUITE_MATCHER = "|".join(
    PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite) for suite in PLATFORM_TEST_SUITE_NAMES
)
PLATFORM_TEST_SUITE_CANONICAL_MATCHER = "|".join(PLATFORM_TEST_SUITE_NAMES)
PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER = PLATFORM_TEST_SUITE_CANONICAL_MATCHER
# Success / total run counts over 30d, 7d, and 24h windows.
PLATFORM_TEST_SUCCESS_EVENTS_30D = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[30d])) or on() vector(0))'
)
PLATFORM_TEST_TOTAL_EVENTS_30D = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[30d])) or on() vector(0))'
)
PLATFORM_TEST_SUCCESS_EVENTS_7D = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[7d])) or on() vector(0))'
)
PLATFORM_TEST_TOTAL_EVENTS_7D = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[7d])) or on() vector(0))'
)
PLATFORM_TEST_SUCCESS_EVENTS_24H = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h])) or on() vector(0))'
)
PLATFORM_TEST_TOTAL_EVENTS_24H = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h])) or on() vector(0))'
)
# Success rates in percent; clamp_min guards empty-window division by zero.
TEST_SUCCESS_RATE = (
    f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_30D}) / clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_30D}), 1)"
)
TEST_SUCCESS_RATE_7D = (
    f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_7D}) / clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_7D}), 1)"
)
TEST_SUCCESS_RATE_24H = (
    f"100 * ({PLATFORM_TEST_SUCCESS_EVENTS_24H}) / clamp_min(({PLATFORM_TEST_TOTAL_EVENTS_24H}), 1)"
)
TEST_FAILURES_24H_TOTAL = (
    f'(sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status!~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h])) or on() vector(0))'
)
PLATFORM_TEST_FAILURES_24H_BY_SUITE = (
    f'sort_desc(sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status!~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h])))'
)
PLATFORM_TEST_ACTIVITY_30D = (
    f'sum by (suite, status) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[30d]))'
)
PLATFORM_TEST_RUNS_24H_TOTAL = PLATFORM_TEST_TOTAL_EVENTS_24H
# Number of suites with at least one run in the last 24h.
PLATFORM_TEST_ACTIVE_SUITES_24H = (
    f'sum((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h])) > 0)) '
    "or on() vector(0)"
)
PLATFORM_TEST_POINT_WINDOW = "1h"
# One Grafana target per suite: rolling 1h success-rate series, refIds A, B, C, ...
PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS = [
    {
        "refId": chr(ord("A") + index),
        "expr": (
            f'(100 * (sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite)}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}'
            f'[{PLATFORM_TEST_POINT_WINDOW}]))) / '
            f'clamp_min((sum(increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite)}",{PLATFORM_TEST_EXPORT_FILTER}}}[{PLATFORM_TEST_POINT_WINDOW}]))), 1))'
        ),
        "legendFormat": suite,
    }
    for index, suite in enumerate(PLATFORM_TEST_SUITE_NAMES)
]
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
    f'sort_desc(100 * (sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h]))) '
    f'/ clamp_min((sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[24h]))), 1))'
)
# Index of suites that reported anything in 30d; used to synthesize -1
# "missing" rows for suites without coverage/smell metrics below.
QUALITY_GATE_SUITE_INDEX_30D = (
    f'sum by (suite) (increase(platform_quality_gate_runs_total{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}}}[30d]))'
)
QUALITY_GATE_COVERAGE_BY_SUITE = (
    f'(max by (suite) ({{__name__=~".*_quality_gate_coverage_percent",{PLATFORM_TEST_EXPORT_FILTER}}})) '
    f'or on(suite) (max by (suite) (platform_quality_gate_workspace_line_coverage_percent{{{PLATFORM_TEST_EXPORT_FILTER}}}))'
)
QUALITY_GATE_COVERAGE_BY_SUITE_WITH_MISSING = (
    f"({QUALITY_GATE_COVERAGE_BY_SUITE}) or on(suite) (0 * ({QUALITY_GATE_SUITE_INDEX_30D}) - 1)"
)
# Shortfall versus the 95% coverage target (0 when already above target).
QUALITY_GATE_COVERAGE_GAP_BY_SUITE = (
    f"clamp_min(95 - ({QUALITY_GATE_COVERAGE_BY_SUITE}), 0)"
)
QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE = (
    f"max by (suite) (platform_quality_gate_source_lines_over_500_total{{{PLATFORM_TEST_EXPORT_FILTER}}})"
)
QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE_WITH_MISSING = (
    f"({QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE}) or on(suite) (0 * ({QUALITY_GATE_SUITE_INDEX_30D}) - 1)"
)
PLATFORM_TEST_CHECKS_SELECTOR = (
    f'__name__=~".*_quality_gate_checks_total",suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}",'
    f"{PLATFORM_TEST_EXPORT_FILTER}"
)
# Percent of each suite's distinct checks whose latest result is non-failing.
PLATFORM_TEST_CURRENT_GATE_HEALTH_BY_SUITE = (
    f'(100 * sum by (suite) (max by (suite, check) '
    f'(({{{PLATFORM_TEST_CHECKS_SELECTOR},result=~"{PLATFORM_TEST_NON_FAILURE_STATUS}"}} > bool 0))) '
    f'/ clamp_min(sum by (suite) (max by (suite, check) '
    f'(({{{PLATFORM_TEST_CHECKS_SELECTOR}}} > bool 0))), 1))'
)
# PVC backup age; unhealthy backups are penalized with a synthetic 999h age.
PVC_BACKUP_AGE_HOURS_BY_PVC = "sort_desc(max by (namespace, pvc) (pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)))"
# ---------------------------------------------------------------------------
# Ananke power / UPS expressions
# ---------------------------------------------------------------------------
ANANKE_SELECTOR = 'job="ananke-power"'
# The two monitored UPS units and the nodes they protect.
ANANKE_UPS_DB_NAME = "Pyrphoros"
ANANKE_UPS_DB_NODE = "titan-db"
ANANKE_UPS_TETHYS_NAME = "Statera"
ANANKE_UPS_TETHYS_NODE = "titan-24"
ANANKE_UPS_DB_SELECTOR = f'{ANANKE_SELECTOR},source="{ANANKE_UPS_DB_NAME}"'
ANANKE_UPS_TETHYS_SELECTOR = f'{ANANKE_SELECTOR},source="{ANANKE_UPS_TETHYS_NAME}"'
ANANKE_UPS_ON_BATTERY = f"sum(ananke_ups_on_battery{{{ANANKE_SELECTOR}}}) or on() vector(0)"
ANANKE_UPS_LOW_BATTERY = f"sum(ananke_ups_low_battery{{{ANANKE_SELECTOR}}}) or on() vector(0)"
ANANKE_UPS_RUNTIME_MIN = f"min(ananke_ups_runtime_seconds{{{ANANKE_SELECTOR}}}) or on() vector(0)"
# Worst-case runtime as a percent of the configured shutdown threshold.
ANANKE_UPS_RUNTIME_HEADROOM_PERCENT = (
    f"100 * min(ananke_ups_runtime_seconds{{{ANANKE_SELECTOR}}}) / "
    f"clamp_min(max(ananke_ups_threshold_seconds{{{ANANKE_SELECTOR}}}), 1)"
)
ANANKE_UPS_TRIGGER_COUNT_1D = f"increase(ananke_shutdown_triggers_total{{{ANANKE_SELECTOR}}}[1d]) or on() vector(0)"
# ---------------------------------------------------------------------------
# GitOps (Flux) health, scraped by the same Ananke exporter.
# ---------------------------------------------------------------------------
GITOPS_SELECTOR = ANANKE_SELECTOR
GITOPS_SOURCE_INFO = (
    f'max by (branch, revision) (ananke_gitops_flux_source_info{{{GITOPS_SELECTOR},namespace="flux-system",name="flux-system"}})'
)
GITOPS_KUSTOMIZATION_READY_PCT = (
    f"100 * sum(max by (namespace, name) (ananke_gitops_kustomization_ready{{{GITOPS_SELECTOR}}})) "
    f"/ clamp_min(count(max by (namespace, name) (ananke_gitops_kustomization_ready{{{GITOPS_SELECTOR}}})), 1)"
)
GITOPS_KUSTOMIZATION_READY_COUNT = (
    f"sum(max by (namespace, name) (ananke_gitops_kustomization_ready{{{GITOPS_SELECTOR}}})) or on() vector(0)"
)
GITOPS_KUSTOMIZATION_TOTAL_COUNT = (
    f"count(max by (namespace, name) (ananke_gitops_kustomization_ready{{{GITOPS_SELECTOR}}})) or on() vector(0)"
)
GITOPS_KUSTOMIZATION_SUSPENDED = (
    f"sum(max by (namespace, name) (ananke_gitops_kustomization_suspended{{{GITOPS_SELECTOR}}})) or on() vector(0)"
)
GITOPS_KUSTOMIZATION_NOT_SUSPENDED_PCT = (
    f"100 * (1 - ({GITOPS_KUSTOMIZATION_SUSPENDED}) / clamp_min(({GITOPS_KUSTOMIZATION_TOTAL_COUNT}), 1))"
)
GITOPS_HELM_READY_PCT = (
    f"100 * sum(max by (namespace, name) (ananke_gitops_helmrelease_ready{{{GITOPS_SELECTOR}}})) "
    f"/ clamp_min(count(max by (namespace, name) (ananke_gitops_helmrelease_ready{{{GITOPS_SELECTOR}}})), 1)"
)
GITOPS_HELM_READY_COUNT = (
    f"sum(max by (namespace, name) (ananke_gitops_helmrelease_ready{{{GITOPS_SELECTOR}}})) or on() vector(0)"
)
GITOPS_HELM_TOTAL_COUNT = (
    f"count(max by (namespace, name) (ananke_gitops_helmrelease_ready{{{GITOPS_SELECTOR}}})) or on() vector(0)"
)
GITOPS_HELM_SUSPENDED = (
    f"sum(max by (namespace, name) (ananke_gitops_helmrelease_suspended{{{GITOPS_SELECTOR}}})) or on() vector(0)"
)
GITOPS_HELM_NOT_SUSPENDED_PCT = (
    f"100 * (1 - ({GITOPS_HELM_SUSPENDED}) / clamp_min(({GITOPS_HELM_TOTAL_COUNT}), 1))"
)
GITOPS_SCRAPE_SUCCESS = f"min(ananke_gitops_scrape_success{{{GITOPS_SELECTOR}}}) or on() vector(0)"
GITOPS_LAST_SCRAPE_AGE = (
    f"(time() - max(ananke_gitops_last_scrape_timestamp_seconds{{{GITOPS_SELECTOR}}})) or on() vector(0)"
)
# Per-UPS breakdowns of runtime, on-battery flag, charge, load, and draw.
ANANKE_UPS_RUNTIME_DB = (
    f'max(ananke_ups_runtime_seconds{{{ANANKE_UPS_DB_SELECTOR}}}) or on() vector(0)'
)
ANANKE_UPS_RUNTIME_TETHYS = (
    f'max(ananke_ups_runtime_seconds{{{ANANKE_UPS_TETHYS_SELECTOR}}}) or on() vector(0)'
)
ANANKE_UPS_ON_BATTERY_DB = (
    f'max(ananke_ups_on_battery{{{ANANKE_UPS_DB_SELECTOR}}}) or on() vector(0)'
)
ANANKE_UPS_ON_BATTERY_TETHYS = (
    f'max(ananke_ups_on_battery{{{ANANKE_UPS_TETHYS_SELECTOR}}}) or on() vector(0)'
)
ANANKE_UPS_BATTERY_CHARGE_DB = (
    f'max(ananke_ups_battery_charge_percent{{{ANANKE_UPS_DB_SELECTOR}}}) or on() vector(0)'
)
ANANKE_UPS_BATTERY_CHARGE_TETHYS = (
    f'max(ananke_ups_battery_charge_percent{{{ANANKE_UPS_TETHYS_SELECTOR}}}) or on() vector(0)'
)
ANANKE_UPS_LOAD_DB = (
    f'max(ananke_ups_load_percent{{{ANANKE_UPS_DB_SELECTOR}}}) or on() vector(0)'
)
ANANKE_UPS_LOAD_TETHYS = (
    f'max(ananke_ups_load_percent{{{ANANKE_UPS_TETHYS_SELECTOR}}}) or on() vector(0)'
)
# Estimated draw in watts = load percent * nominal watts / 100.
ANANKE_UPS_DRAW_WATTS_DB = (
    f'max((ananke_ups_load_percent{{{ANANKE_UPS_DB_SELECTOR}}} '
    f'* ananke_ups_power_nominal_watts{{{ANANKE_UPS_DB_SELECTOR}}}) / 100) or on() vector(0)'
)
ANANKE_UPS_DRAW_WATTS_TETHYS = (
    f'max((ananke_ups_load_percent{{{ANANKE_UPS_TETHYS_SELECTOR}}} '
    f'* ananke_ups_power_nominal_watts{{{ANANKE_UPS_TETHYS_SELECTOR}}}) / 100) or on() vector(0)'
)
ANANKE_UPS_DRAW_WATTS_DB_SERIES = (
    f'((ananke_ups_load_percent{{{ANANKE_UPS_DB_SELECTOR}}} '
    f'* ananke_ups_power_nominal_watts{{{ANANKE_UPS_DB_SELECTOR}}}) / 100)'
)
ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES = (
    f'((ananke_ups_load_percent{{{ANANKE_UPS_TETHYS_SELECTOR}}} '
    f'* ananke_ups_power_nominal_watts{{{ANANKE_UPS_TETHYS_SELECTOR}}}) / 100)'
)
ANANKE_UPS_RUNTIME_BY_SOURCE = f"ananke_ups_runtime_seconds{{{ANANKE_SELECTOR}}}"
ANANKE_UPS_LOAD_BY_SOURCE = f"ananke_ups_load_percent{{{ANANKE_SELECTOR}}}"
ANANKE_UPS_CHARGE_BY_SOURCE = f"ananke_ups_battery_charge_percent{{{ANANKE_SELECTOR}}}"
ANANKE_UPS_TRIGGER_BY_SOURCE = f"ananke_ups_trigger_active{{{ANANKE_SELECTOR}}}"
# ---------------------------------------------------------------------------
# Typhon climate sensor expressions
# ---------------------------------------------------------------------------
CLIMATE_SENSOR_COUNT = "count(typhon_temperature_celsius) or on() vector(0)"
CLIMATE_TEMP_MAX = "max(typhon_temperature_celsius) or on() vector(0)"
CLIMATE_PRESSURE_CURRENT = "max(typhon_vpd_kpa) or on() vector(0)"
CLIMATE_HUMIDITY_MAX = "max(typhon_relative_humidity_percent) or on() vector(0)"
CLIMATE_TEMP_SERIES = "typhon_temperature_celsius"
CLIMATE_PRESSURE_SERIES = "typhon_vpd_kpa"
CLIMATE_HUMIDITY_SERIES = "typhon_relative_humidity_percent"
# Dew point from temperature + relative humidity (Magnus-style formula;
# clamp_min keeps ln() defined when humidity reads 0).
CLIMATE_DEWPOINT_SERIES = (
    "(243.12 * (ln(clamp_min(typhon_relative_humidity_percent, 1) / 100) + "
    "(17.62 * typhon_temperature_celsius) / (243.12 + typhon_temperature_celsius))) / "
    "(17.62 - (ln(clamp_min(typhon_relative_humidity_percent, 1) / 100) + "
    "(17.62 * typhon_temperature_celsius) / (243.12 + typhon_temperature_celsius)))"
)
CLIMATE_DEWPOINT_CURRENT = f"max({CLIMATE_DEWPOINT_SERIES}) or on() vector(0)"
# Current level and raw series per fan group.
CLIMATE_FAN_OUTLET_CURRENT = (
    'max(typhon_fan_speed_level{fan_group="outlet"}) or on() vector(0)'
)
CLIMATE_FAN_INSIDE_INLET_CURRENT = (
    'max(typhon_fan_speed_level{fan_group="inside_inlet"}) or on() vector(0)'
)
CLIMATE_FAN_OUTSIDE_INLET_CURRENT = (
    'max(typhon_fan_speed_level{fan_group="outside_inlet"}) or on() vector(0)'
)
CLIMATE_FAN_INTERIOR_CURRENT = (
    'max(typhon_fan_speed_level{fan_group="interior"}) or on() vector(0)'
)
CLIMATE_FAN_OUTLET_SERIES = (
    'typhon_fan_speed_level{fan_group="outlet"}'
)
CLIMATE_FAN_INSIDE_INLET_SERIES = (
    'typhon_fan_speed_level{fan_group="inside_inlet"}'
)
CLIMATE_FAN_OUTSIDE_INLET_SERIES = (
    'typhon_fan_speed_level{fan_group="outside_inlet"}'
)
CLIMATE_FAN_INTERIOR_SERIES = (
    'typhon_fan_speed_level{fan_group="interior"}'
)
# Postgres: used vs max connections rendered as two labeled series.
POSTGRES_CONN_USED = (
    'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
    'or label_replace(max(pg_settings_max_connections), "conn", "max", "__name__", ".*")'
)
POSTGRES_CONN_HOTTEST = 'topk(1, sum by (datname) (pg_stat_activity_count))'
# One-off Jobs: Jobs NOT owned by a CronJob, and the age of their pods.
ONEOFF_JOB_OWNER = (
    'label_replace(kube_job_owner{owner_kind="CronJob"}, "owner_name", "$1", "job_name", "(.*)")'
)
ONEOFF_JOB_PODS = f'(kube_pod_owner{{owner_kind="Job"}} unless on(namespace, owner_name) {ONEOFF_JOB_OWNER})'
ONEOFF_JOB_POD_AGE_HOURS = (
    '((time() - kube_pod_start_time{pod!=""}) / 3600) '
    f'* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS} '
    '* on(namespace,pod) group_left(phase) '
    'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})'
)
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES)
# ---------------------------------------------------------------------------
# Network / ingress / control-plane latency expressions
# ---------------------------------------------------------------------------
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
TRAEFIK_NET_INGRESS = (
    'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
    " or on() vector(0)"
)
TRAEFIK_NET_EGRESS = (
    'sum(rate(container_network_transmit_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
    " or on() vector(0)"
)
NET_CLUSTER_RX = (
    'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
    " or on() vector(0)"
)
NET_CLUSTER_TX = (
    'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
    " or on() vector(0)"
)
# Exclude loopback and all virtual/overlay devices: physical NICs only.
PHYSICAL_NET_FILTER = 'device!~"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*"'
NET_NODE_RX_PHYS = (
    f'sum(rate(node_network_receive_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)'
)
NET_NODE_TX_PHYS = (
    f'sum(rate(node_network_transmit_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)'
)
NET_TOTAL_EXPR = NET_NODE_TX_PHYS
NET_INGRESS_EXPR = NET_NODE_RX_PHYS
NET_EGRESS_EXPR = NET_NODE_TX_PHYS
NET_INTERNAL_EXPR = (
    'sum(rate(container_network_receive_bytes_total{namespace!="traefik",pod!=""}[5m]) '
    '+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))'
    ' or on() vector(0)'
)
APISERVER_5XX_RATE = 'sum(rate(apiserver_request_total{code=~"5.."}[5m]))'
APISERVER_P99_LATENCY_MS = (
    "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000"
)
ETCD_P99_LATENCY_MS = (
    "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000"
)
# Traefik SLI over a fixed 5m window (non-5xx fraction of all requests).
TRAEFIK_TOTAL_5M = "sum(rate(traefik_entrypoint_requests_total[5m]))"
TRAEFIK_SUCCESS_5M = 'sum(rate(traefik_entrypoint_requests_total{code!~"5.."}[5m]))'
TRAEFIK_SLI_5M = f"({TRAEFIK_SUCCESS_5M}) / clamp_min({TRAEFIK_TOTAL_5M}, 1)"
TRAEFIK_P99_LATENCY_MS = (
    "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
)
TRAEFIK_P95_LATENCY_MS = (
    "histogram_quantile(0.95, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
)
# Availability SLO target used for error-budget burn calculations.
SLO_AVAILABILITY = 0.999


def traefik_sli(window):
    """Return the Traefik availability SLI (non-5xx share) for a rate window.

    clamp_min keeps the denominator at >= 1 so zero traffic does not divide by zero.
    """
    total = f'sum(rate(traefik_entrypoint_requests_total[{window}]))'
    success = f'sum(rate(traefik_entrypoint_requests_total{{code!~"5.."}}[{window}]))'
    return f"({success}) / clamp_min({total}, 1)"


def traefik_burn(window):
    """Return the error-budget burn rate for the given window.

    Burn = error ratio divided by the allowed error budget (1 - SLO_AVAILABILITY).
    Note the budget is interpolated into the PromQL string as a Python float.
    """
    sli = traefik_sli(window)
    return f"(1 - ({sli})) / {1 - SLO_AVAILABILITY}"


# ---------------------------------------------------------------------------
# Panel factories
# ---------------------------------------------------------------------------


def stat_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    decimals=None,
    thresholds=None,
    text_mode="value",
    legend=None,
    instant=False,
    value_suffix=None,
    links=None,
    targets=None,
    field_overrides=None,
    description=None,
    orientation=None,
    wide_layout=None,
):
    """Return a Grafana stat panel definition.

    Either pass a single PromQL string as ``expr`` or a full ``targets`` list;
    an explicit ``targets`` wins over ``expr``.
    """
    defaults = {
        "color": {"mode": "thresholds"},
        "mappings": [],
        # Default thresholds: grey at baseline, green from 1 upward.
        "thresholds": thresholds
        or {
            "mode": "absolute",
            "steps": [
                {"color": "rgba(115, 115, 115, 1)", "value": None},
                {"color": "green", "value": 1},
            ],
        },
        "unit": unit,
        "custom": {"displayMode": "auto"},
    }
    if value_suffix:
        defaults["custom"]["valueSuffix"] = value_suffix
    if decimals is not None:
        defaults["decimals"] = decimals
    target_list = targets if targets is not None else [{"expr": expr, "refId": "A"}]
    panel = {
        "id": panel_id,
        "type": "stat",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": target_list,
        "fieldConfig": {"defaults": defaults, "overrides": field_overrides or []},
        "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "textMode": text_mode,
        },
    }
    if orientation:
        panel["options"]["orientation"] = orientation
    if wide_layout is not None:
        panel["options"]["wideLayout"] = wide_layout
    # A legend format only makes sense for a single target.
    if legend and len(panel["targets"]) == 1:
        panel["targets"][0]["legendFormat"] = legend
    if instant:
        # setdefault keeps any explicit per-target instant flag intact.
        for t in panel["targets"]:
            t.setdefault("instant", True)
    if links:
        panel["links"] = links
    if description:
        panel["description"] = description
    return panel


def gauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    min_value=0,
    max_value=1,
    thresholds=None,
    links=None,
):
    """Return a Grafana gauge panel definition (green until max_value, then red)."""
    return {
        "id": panel_id,
        "type": "gauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {
            "defaults": {
                "min": min_value,
                "max": max_value,
                "thresholds": thresholds
                or {
                    "mode": "absolute",
                    "steps": [
                        {"color": "green", "value": None},
                        {"color": "red", "value": max_value},
                    ],
                },
            },
            "overrides": [],
        },
        "options": {
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "orientation": "auto",
            "showThresholdMarkers": False,
            "showThresholdLabels": False,
        },
        # Only attach "links" when some were provided.
        **({"links": links} if links else {}),
    }


def timeseries_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    max_value=None,
    legend=None,
    legend_display="table",
    legend_placement="bottom",
    legend_calcs=None,
    time_from=None,
    links=None,
    targets=None,
    field_overrides=None,
    description=None,
    data_links=None,
):
    """Return a Grafana time-series panel definition.

    As with stat_panel, an explicit ``targets`` list overrides the single ``expr``.
    """
    target_list = targets if targets is not None else [{"expr": expr, "refId": "A"}]
    panel = {
        "id": panel_id,
        "type": "timeseries",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": target_list,
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": field_overrides or []},
        "options": {
            "legend": {
                "displayMode": legend_display,
                "placement": legend_placement,
            },
            "tooltip": {"mode": "multi"},
        },
    }
    if max_value is not None:
        panel["fieldConfig"]["defaults"]["max"] = max_value
    # A legend format only makes sense for a single target.
    if legend and len(panel["targets"]) == 1:
        panel["targets"][0]["legendFormat"] = legend
    if legend_calcs:
        panel["options"]["legend"]["calcs"] = legend_calcs
    if time_from:
        panel["timeFrom"] = time_from
    if links:
        panel["links"] = links
    if data_links:
        panel["fieldConfig"]["defaults"]["links"] = data_links
    if description:
        panel["description"] = description
    return panel


def state_timeline_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    description,
    thresholds,
    unit="percent",
    min_value=0,
    max_value=100,
    legend="{{suite}}",
    links=None,
    data_links=None,
):
    """Return a lane-style state timeline panel for categorical health over time."""
    defaults = {
        "color": {"mode": "thresholds"},
        "unit": unit,
        "thresholds": thresholds,
        "custom": {
            "fillOpacity": 70,
            "lineWidth": 0,
            "spanNulls": True,
        },
    }
    # min/max are optional; pass None to leave the axis unbounded.
    if min_value is not None:
        defaults["min"] = min_value
    if max_value is not None:
        defaults["max"] = max_value
    panel = {
        "id": panel_id,
        "type": "state-timeline",
        "title": title,
        "description": description,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A", "legendFormat": legend}],
        "fieldConfig": {"defaults": defaults, "overrides": []},
        "options": {
            "mergeValues": True,
            "showValue": "never",
            "legend": {"displayMode": "list", "placement": "bottom"},
            "tooltip": {"mode": "single", "sort": "none"},
        },
    }
    if links:
        panel["links"] = links
    if data_links:
        panel["fieldConfig"]["defaults"]["links"] = data_links
    return panel


def apply_bar_timeseries_style(panel, *, stacked=False, fill_opacity=70):
    """Make a time-series panel read as volume bars instead of interpolated lines.

    Mutates and returns ``panel``.
    """
    panel["fieldConfig"]["defaults"]["custom"] = {
        "drawStyle": "bars",
        "barAlignment": 0,
        "barWidthFactor": 0.72,
        "lineWidth": 0,
        "fillOpacity": fill_opacity,
        "gradientMode": "none",
        "showPoints": "never",
        "spanNulls": True,
    }
    if stacked:
        panel["fieldConfig"]["defaults"]["custom"]["stacking"] = {"mode": "normal", "group": "A"}
    return panel


def fixed_color_overrides(series_colors):
    """Return fixed-color overrides keyed by exact series name.

    ``series_colors`` maps legend/series name -> Grafana color name.
    """
    return [
        {
            "matcher": {"id": "byName", "options": series_name},
            "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": color}}],
        }
        for series_name, color in series_colors.items()
    ]


def table_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    transformations=None,
    instant=False,
    options=None,
    filterable=True,
    footer=None,
    format=None,
    description=None,
    field_overrides=None,
    links=None,
):
    """Return a Grafana table panel definition."""
    # Optional PromQL subquery helpers in expr: share(), etc.
    panel_options = {"showHeader": True, "columnFilters": False}
    if options:
        panel_options.update(options)
    if footer is not None:
        panel_options["footer"] = footer
    field_defaults = {"unit": unit, "custom": {"filterable": filterable}}
    target = {"expr": expr, "refId": "A", **({"instant": True} if instant else {})}
    if format:
        # e.g. "table" to ask the datasource for tabular framing.
        target["format"] = format
    panel = {
        "id": panel_id,
        "type": "table",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [target],
        "fieldConfig": {"defaults": field_defaults, "overrides": field_overrides or []},
        "options": panel_options,
    }
    if transformations:
        panel["transformations"] = transformations
    if description:
        panel["description"] = description
    if links:
        panel["links"] = links
    return panel


def pie_panel(panel_id, title, expr, grid, *, links=None, description=None):
    """Return a pie chart panel with readable namespace labels."""
    panel = {
        "id": panel_id,
        "type": "piechart",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
        "fieldConfig": {
            "defaults": {
                "unit": "percent",
                "color": {"mode": "palette-classic"},
            },
            "overrides": [],
        },
        "options": {
            "legend": {"displayMode": "list", "placement": "right"},
            "pieType": "pie",
            "displayLabels": [],
            "tooltip": {"mode": "single"},
            "colorScheme": "interpolateSpectral",
            "colorBy": "value",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
        },
    }
    if links:
        panel["links"] = links
    if description:
        panel["description"] = description
    return panel


def namespace_scope_variable(var_name, label):
    """Return a hidden custom template variable for the namespace-scope filter.

    NAMESPACE_SCOPE_* values are defined elsewhere in this file; the default
    selection is "workload namespaces only".
    """
    options = [
        {
            "text": "workload namespaces only",
            "value": NAMESPACE_SCOPE_WORKLOAD,
            "selected": True,
        },
        {"text": "all namespaces", "value": NAMESPACE_SCOPE_ALL, "selected": False},
        {
            "text": "infrastructure namespaces only",
            "value": NAMESPACE_SCOPE_INFRA,
            "selected": False,
        },
    ]
    # Grafana custom-variable query syntax: "text : value" pairs, comma-separated.
    query = (
        "workload namespaces only : "
        + NAMESPACE_SCOPE_WORKLOAD
        + ",all namespaces : "
        + NAMESPACE_SCOPE_ALL
        + ",infrastructure namespaces only : "
        + NAMESPACE_SCOPE_INFRA
    )
    return {
        "name": var_name,
        "label": label,
        "type": "custom",
        "query": query,
        "current": {"text": options[0]["text"], "value": options[0]["value"], "selected": True},
        "options": options,
        # hide=2 hides both label and value in the dashboard toolbar.
        "hide": 2,
        "multi": False,
        "includeAll": False,
        "refresh": 1,
        "sort": 0,
        "skipUrlSync": False,
    }


def namespace_scope_links(var_name):
    """Return panel links that switch the namespace-scope variable in place.

    The link for ``var_name`` pins a concrete value; every other scope variable
    (from NAMESPACE_SCOPE_VARS, defined elsewhere in this file) is carried over
    via Grafana's ${var} interpolation.
    """
    def with_value(value):
        encoded = urllib.parse.quote(value, safe="")
        params = []
        for other in NAMESPACE_SCOPE_VARS:
            if other == var_name:
                params.append(f"var-{other}={encoded}")
            else:
                params.append(f"var-{other}=${{{other}}}")
        return "?" + "&".join(params)

    return [
        {"title": "Workload namespaces only", "url": with_value(NAMESPACE_SCOPE_WORKLOAD), "targetBlank": False},
        {"title": "All namespaces", "url": with_value(NAMESPACE_SCOPE_ALL), "targetBlank": False},
        {
            "title": "Infrastructure namespaces only",
            "url": with_value(NAMESPACE_SCOPE_INFRA),
            "targetBlank": False,
        },
    ]


def testing_suite_variable():
    """Return the "suite" custom variable listing all platform test suites.

    PLATFORM_TEST_SUITE_NAMES and the all-matcher constant are defined elsewhere
    in this file.
    """
    options = [
        {
            "text": suite,
            "value": suite,
            "selected": False,
        }
        for suite in PLATFORM_TEST_SUITE_NAMES
    ]
    query = ",".join(f"{suite} : {suite}" for suite in PLATFORM_TEST_SUITE_NAMES)
    return {
        "name": "suite",
        "label": "Suite",
        "type": "custom",
        "query": query,
        "current": {"text": "All", "value": "$__all", "selected": True},
        "options": options,
        "hide": 0,
        "multi": False,
        "includeAll": True,
        "allValue": PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER,
        "refresh": 1,
        "sort": 1,
        "skipUrlSync": False,
    }


def testing_case_variable():
    """Return the "test" query variable, scoped by the suite and branch variables."""
    return {
        "name": "test",
        "label": "Test Case",
        "type": "query",
        "query": f'label_values(platform_quality_gate_test_case_result{{suite=~"${{suite:regex}}",branch!="",branch=~"${{branch:regex}}",test!="",test!="__no_test_cases__",{PLATFORM_TEST_EXPORT_FILTER}}}, test)',
        "current": {"text": "All", "value": "$__all", "selected": True},
        "options": [],
        "hide": 0,
        "multi": False,
        "includeAll": True,
        "allValue": ".*",
        # refresh=2: re-query on time-range change.
        "refresh": 2,
        "sort": 1,
        "skipUrlSync": False,
    }


def testing_branch_variable():
    """Return the "branch" query variable, scoped by the suite variable."""
    return {
        "name": "branch",
        "label": "Branch",
        "type": "query",
        "query": f'label_values(platform_quality_gate_build_info{{suite=~"${{suite:regex}}",branch!="",{PLATFORM_TEST_EXPORT_FILTER}}}, branch)',
        "current": {"text": "All", "value": "$__all", "selected": True},
        "options": [],
        "hide": 0,
        "multi": False,
        "includeAll": True,
        "allValue": ".*",
        "refresh": 2,
        "sort": 1,
        "skipUrlSync": False,
    }


def jenkins_base_variable():
    """Return the editable textbox variable holding the Jenkins base URL."""
    return {
        "name": "jenkins_base",
        "label": "Jenkins Base URL",
        "type": "textbox",
        "query": JENKINS_UI_BASE_DEFAULT,
        "current": {
            "text": JENKINS_UI_BASE_DEFAULT,
            "value": JENKINS_UI_BASE_DEFAULT,
            "selected": True,
        },
        "hide": 0,
        "skipUrlSync": False,
    }


def jenkins_suite_links(base_var="${jenkins_base}"):
    """Return Jenkins links: the root UI plus per-suite job and artifact links.

    PLATFORM_TEST_JENKINS_JOB_BY_SUITE (defined elsewhere in this file) maps a
    suite to its Jenkins job name; the suite name itself is the fallback.
    """
    links = [{"title": "Open Jenkins", "url": f"{base_var}/", "targetBlank": True}]
    for suite in PLATFORM_TEST_SUITE_NAMES:
        job = PLATFORM_TEST_JENKINS_JOB_BY_SUITE.get(suite, suite)
        # Jenkins job names may contain characters that need escaping in URLs.
        encoded_job = urllib.parse.quote(job, safe="")
        links.append(
            {
                "title": f"{suite}: Job",
                "url": f"{base_var}/job/{encoded_job}/",
                "targetBlank": True,
            }
        )
        links.append(
            {
                "title": f"{suite}: Last Artifacts",
                "url": f"{base_var}/job/{encoded_job}/lastCompletedBuild/artifact/",
                "targetBlank": True,
            }
        )
    return links


def jenkins_artifact_data_links(base_var="${jenkins_base}"):
    """Return data links that resolve a specific build via series labels.

    Relies on the series carrying ``jenkins_job`` and ``build_number`` labels.
    """
    return [
        {
            "title": "Open build artifacts",
            "url": f"{base_var}/job/${{__field.labels.jenkins_job}}/${{__field.labels.build_number}}/artifact/",
            "targetBlank": True,
        },
        {
            "title": "Open build",
            "url": f"{base_var}/job/${{__field.labels.jenkins_job}}/${{__field.labels.build_number}}/",
            "targetBlank": True,
        },
    ]


def jenkins_latest_artifact_data_links(base_var="${jenkins_base}"):
    """Return data links pointing at the last completed build of the labelled job."""
    return [
        {
            "title": "Open latest artifacts",
            "url": f"{base_var}/job/${{__field.labels.jenkins_job}}/lastCompletedBuild/artifact/",
            "targetBlank": True,
        },
        {
            "title": "Open Jenkins job",
            "url": f"{base_var}/job/${{__field.labels.jenkins_job}}/",
            "targetBlank": True,
        },
    ]


def bargauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    legend=None,
    links=None,
    limit=None,
    sort_order="desc",
    thresholds=None,
    decimals=None,
    instant=False,
    overrides=None,
    data_links=None,
    include_color=True,
    description=None,
):
    """Return a bar gauge panel with label-aware reduction."""
    # Wrap in sort()/sort_desc() unless the caller already sorted the expression.
    cleaned_expr = expr.strip()
    if not cleaned_expr.startswith(("sort(", "sort_desc(")):
        if sort_order == "desc":
            expr = f"sort_desc({expr})"
        elif sort_order == "asc":
            expr = f"sort({expr})"
    defaults = {}
    if include_color:
        defaults["color"] = {"mode": "thresholds"}
    defaults.update(
        {
            "unit": unit,
            "min": 0,
            "max": 100 if unit == "percent" else None,
            "thresholds": thresholds
            or {
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 50},
                    {"color": "orange", "value": 70},
                    {"color": "red", "value": 85},
                ],
            },
        }
    )
    panel = {
        "id": panel_id,
        "type": "bargauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [
            {
                "expr": expr,
                "refId": "A",
                "legendFormat": legend or "{{node}}",
                **({"instant": True} if instant else {}),
            }
        ],
        "fieldConfig": {
            "defaults": defaults,
            "overrides": [],
        },
        "options": {
            "displayMode": "basic",
            "orientation": "horizontal",
            "reduceOptions": {
                "calcs": ["lastNotNull"],
                "fields": "",
                "values": False,
            },
        },
    }
    if overrides:
        panel["fieldConfig"]["overrides"].extend(overrides)
    if decimals is not None:
        panel["fieldConfig"]["defaults"]["decimals"] = decimals
    if links:
        panel["links"] = links
    if description:
        panel["description"] = description
    if data_links:
        panel["fieldConfig"]["defaults"]["links"] = data_links
    # Keep bars ordered by value descending for readability.
    panel["transformations"] = [
        {
            "id": "sortBy",
            "options": {"fields": ["Value"], "order": sort_order},
        }
    ]
    if limit:
        panel["transformations"].append({"id": "limit", "options": {"limit": limit}})
    return panel


def set_bargauge_display_mode(panels, display_mode):
    """Apply a display mode to bar gauges, including gauges inside collapsed rows."""
    for panel in panels:
        if panel.get("type") == "bargauge":
            panel["options"]["displayMode"] = display_mode
        # Collapsed rows carry their child panels under "panels"; recurse into them.
        if panel.get("panels"):
            set_bargauge_display_mode(panel["panels"], display_mode)


def text_panel(panel_id, title, content, grid):
    """Return a markdown text panel (no datasource)."""
    return {
        "id": panel_id,
        "type": "text",
        "title": title,
        "gridPos": grid,
        "datasource": None,
        "options": {"mode": "markdown", "content": content},
    }


def row_panel(panel_id, title, y, *, collapsed=True, panels=None):
    """Return a Grafana row, optionally carrying collapsed child panels."""
    return {
        "id": panel_id,
        "type": "row",
        "title": title,
        "gridPos": {"h": 1, "w": 24, "x": 0, "y": y},
        "collapsed": collapsed,
        # Only collapsed rows embed their children; expanded rows list them flat.
        **({"panels": panels or []} if collapsed else {}),
    }


# Human-readable titles for cross-dashboard links, keyed by dashboard UID.
DASHBOARD_LINK_TITLES = {
    "atlas-overview": "Open Atlas Overview",
    "atlas-pods": "Open Atlas Pods",
    "atlas-nodes": "Open Atlas Nodes",
    "atlas-storage": "Open Atlas Storage",
    "atlas-network": "Open Atlas Network",
    "atlas-mail": "Open Atlas Mail",
    "atlas-jobs": "Atlas Testing",
    "atlas-testing": "Atlas Testing",
    "atlas-power": "Open Atlas Power",
    "atlas-gitops": "Open Atlas GitOps",
    "atlas-gpu": "Open Atlas GPU",
}


def link_to(uid):
    """Return a one-element links list pointing at the dashboard with this UID."""
    return [
        {
            "title": DASHBOARD_LINK_TITLES.get(uid, f"Open {uid} dashboard"),
            "url": f"/d/{uid}",
            "targetBlank": True,
        }
    ]


def overview_link_to(uid):
    """Return the historical Overview dashboard link label."""
    return [{"title": f"Open {uid} dashboard", "url": f"/d/{uid}", "targetBlank": True}]


# ---------------------------------------------------------------------------
# Dashboard builders
# ---------------------------------------------------------------------------


def build_overview():
    """Build the panel list for the Atlas overview dashboard."""
    panels = []
overview_link = overview_link_to climate_drop_labels = "job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group" climate_temp_series = f"max without ({climate_drop_labels}) (typhon_temperature_celsius != 0)" climate_humidity_series = f"max without ({climate_drop_labels}) (typhon_relative_humidity_percent != 0)" climate_pressure_series = f"max without ({climate_drop_labels}) (typhon_vpd_kpa != 0)" overview_pvc_backup_metric_presence = ( 'count({__name__=~"pvc_backup_(count|last_success_timestamp_seconds|health_reason)",driver="restic"})' ) overview_pvc_backup_missing = ( 'label_replace(label_replace(vector(999), "namespace", "maintenance", "__name__", ".*"), ' '"pvc", "backup-telemetry-missing", "__name__", ".*")' ) overview_pvc_backup_age = ( 'max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver="restic"}) / 3600) ' 'or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver="restic",reason=~"missing|no_completed|lookup_failed|unknown_timestamp"} > 0) ' f'* (pvc_backup_count{{driver="restic"}} > bool 0)) * 999))) or on() ' f'(({overview_pvc_backup_missing}) unless on() (({overview_pvc_backup_metric_presence}) > 0))' ) def overview_metric_pair_expr(first_expr, first_name, second_expr, second_name): return ( f'label_replace({first_expr}, "metric", "{first_name}", "__name__", ".*") ' f'or label_replace({second_expr}, "metric", "{second_name}", "__name__", ".*")' ) def overview_platform_test_success_targets(): suites = [ ("ariadne", "ariadne"), ("metis", "metis"), ("ananke", "ananke"), ("atlasbot", "atlasbot"), ("lesavka", "lesavka"), ("pegasus", "pegasus|pegasus-health|pegasus_health"), ("soteria", "soteria"), ("titan-iac", "titan-iac|titan_iac"), ("bstein-home", "bstein-home|bstein_home"), ("arcanagon", "arcanagon"), ("data-prepper", "data-prepper|data_prepper"), ] targets = [] for index, (legend, suite_regex) in enumerate(suites): total = 
f'sum(increase(platform_quality_gate_runs_total{{suite=~"{suite_regex}"}}[1h]))' passed = ( f'sum(increase(platform_quality_gate_runs_total{{suite=~"{suite_regex}",' f'status=~"{PLATFORM_TEST_SUCCESS_STATUS}"}}[1h]))' ) targets.append( { "refId": chr(ord("A") + index), "expr": f"(100 * ({passed}) / clamp_min(({total}), 1)) and on() (({total}) > 0) or on() vector(0)", "legendFormat": legend, } ) return targets age_thresholds = { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 6}, {"color": "orange", "value": 24}, {"color": "red", "value": 48}, ], } row1_stats = [ { "id": 2, "title": "Control Plane Ready", "expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})', "kind": "gauge", "max_value": CONTROL_TOTAL, "thresholds": { "mode": "absolute", "steps": [ {"color": "red", "value": None}, {"color": "green", "value": CONTROL_TOTAL}, ], }, }, { "id": 3, "title": "Control Plane Workloads", "expr": CONTROL_WORKLOADS_EXPR, "kind": "stat", "thresholds": { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 1}, {"color": "orange", "value": 2}, {"color": "red", "value": 3}, ], }, "links": overview_link("atlas-pods"), }, { "id": 5, "title": "Stuck Terminating", "expr": STUCK_TERMINATING_EXPR, "kind": "stat", "thresholds": { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 1}, {"color": "orange", "value": 2}, {"color": "red", "value": 3}, ], }, "links": overview_link("atlas-pods"), }, { "id": 27, "title": "Atlas Availability (365d)", "expr": UPTIME_PERCENT_EXPR, "kind": "stat", "thresholds": UPTIME_PERCENT_THRESHOLDS, "unit": "percentunit", "decimals": 4, "text_mode": "value", "instant": True, "description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. 
Grafana keeps the last successful rollup for up to 24h so one missed long-window evaluation does not render as No data.", }, { "id": 4, "title": "Problem Pods", "expr": PROBLEM_PODS_EXPR, "kind": "stat", "thresholds": { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 1}, {"color": "orange", "value": 2}, {"color": "red", "value": 3}, ], }, "links": overview_link("atlas-pods"), }, { "id": 6, "title": "CrashLoop / ImagePull", "expr": CRASHLOOP_EXPR, "kind": "stat", "thresholds": { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 1}, {"color": "orange", "value": 2}, {"color": "red", "value": 3}, ], }, "links": overview_link("atlas-pods"), }, { "id": 1, "title": "Workers Ready", "expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', "kind": "gauge", "max_value": WORKER_TOTAL, "thresholds": { "mode": "absolute", "steps": [ {"color": "red", "value": None}, {"color": "orange", "value": WORKER_TOTAL - 2}, {"color": "yellow", "value": WORKER_TOTAL - 1}, {"color": "green", "value": WORKER_TOTAL}, ], }, }, ] def gauge_grid(idx): width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4 x = sum(GAUGE_WIDTHS[:idx]) return width, x for idx, item in enumerate(row1_stats): panel_id = item["id"] width, x = gauge_grid(idx) grid = {"h": 5, "w": width, "x": x, "y": 0} kind = item.get("kind", "gauge") if kind == "stat": panels.append( stat_panel( panel_id, item["title"], item["expr"], grid, thresholds=item.get("thresholds"), legend=None, links=item.get("links"), text_mode=item.get("text_mode", "value"), value_suffix=item.get("value_suffix"), unit=item.get("unit", "none"), decimals=item.get("decimals"), instant=item.get("instant", False), description=item.get("description"), ) ) else: panels.append( gauge_panel( panel_id, item["title"], item["expr"], grid, min_value=0, max_value=item.get("max_value", 5), thresholds=item.get("thresholds"), 
links=item.get("links"), ) ) top_health_panels = [ (7, "Hottest node: CPU", topk_with_node(node_cpu_expr()), "percent"), (8, "Hottest node: RAM", topk_with_node(node_mem_expr()), "percent"), (9, "Hottest node: NET (rx+tx)", topk_with_node(node_net_expr()), "Bps"), (10, "Hottest node: I/O (r+w)", topk_with_node(node_io_expr()), "Bps"), (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"), (25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"), (26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"), ] for idx, (panel_id, title, expr, unit) in enumerate(top_health_panels): is_hottest_panel = panel_id in {7, 8, 9, 10} panels.append( stat_panel( panel_id, title, f"{expr}", {"h": 2, "w": 3, "x": 3 * idx, "y": 5}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, text_mode="name_and_value" if is_hottest_panel else "value", legend="{{node}}" if is_hottest_panel else None, instant=is_hottest_panel, links=overview_link("atlas-storage" if panel_id in {23, 24, 25, 26} else "atlas-nodes"), ) ) mail_bounce_rate_thresholds = { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 5}, {"color": "orange", "value": 8}, {"color": "red", "value": 10}, ], } mail_limit_thresholds = { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 70}, {"color": "orange", "value": 85}, {"color": "red", "value": 95}, ], } mail_success_thresholds = { "mode": "absolute", "steps": [ {"color": "red", "value": None}, {"color": "orange", "value": 90}, {"color": "yellow", "value": 95}, {"color": "green", "value": 98}, ], } dark_red = "dark-red" dark_orange = "dark-orange" dark_yellow = "dark-yellow" dark_green = "dark-green" dark_blue = "dark-blue" test_success_thresholds = { "mode": "absolute", "steps": [ {"color": dark_red, "value": None}, {"color": dark_orange, "value": 70}, {"color": 
dark_yellow, "value": 85}, {"color": dark_green, "value": 95}, {"color": dark_blue, "value": 100}, ], } fan_intensity_thresholds = { "mode": "absolute", "steps": [ {"color": "#1f60c4", "value": None}, {"color": "#2870b8", "value": 1}, {"color": "#2f8599", "value": 2}, {"color": "#2f9e44", "value": 3}, {"color": "#76a935", "value": 4}, {"color": "#d4b106", "value": 5}, {"color": "#d69605", "value": 6}, {"color": "#e06c00", "value": 7}, {"color": "#d95718", "value": 8}, {"color": "#c92a2a", "value": 9}, {"color": "#8f1d1d", "value": 10}, ], } fan_intensity_expr = ( f'label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="1"}}), "fan", "Outlet", "__name__", ".*") ' f'or label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="2"}}), "fan", "Inlet - Inside", "__name__", ".*") ' f'or label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="3"}}), "fan", "Inlet - Outside", "__name__", ".*") ' f'or label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="4"}}), "fan", "Interior", "__name__", ".*")' ) gitops_health_history_expr = ( f'label_replace({GITOPS_KUSTOMIZATION_READY_PCT}, "signal", "Kustomizations Ready", "__name__", ".*") ' f'or label_replace({GITOPS_HELM_READY_PCT}, "signal", "HelmReleases Ready", "__name__", ".*") ' f'or label_replace({GITOPS_KUSTOMIZATION_NOT_SUSPENDED_PCT}, "signal", "Kustomizations Not Suspended", "__name__", ".*") ' f'or label_replace({GITOPS_HELM_NOT_SUSPENDED_PCT}, "signal", "HelmReleases Not Suspended", "__name__", ".*")' ) compact_current_text = {"titleSize": 11, "valueSize": 20} perfect_count_thresholds = { "mode": "absolute", "steps": [ {"color": dark_red, "value": None}, {"color": dark_yellow, "value": max(len(PLATFORM_TEST_SUITE_NAMES) - 2, 1)}, {"color": dark_green, "value": len(PLATFORM_TEST_SUITE_NAMES) - 1}, {"color": dark_blue, "value": len(PLATFORM_TEST_SUITE_NAMES)}, ], } failure_count_thresholds = { "mode": "absolute", "steps": 
[ {"color": dark_blue, "value": None}, {"color": dark_yellow, "value": 1}, {"color": dark_orange, "value": 3}, {"color": dark_red, "value": 5}, ], } overview_avg_coverage = f"(avg(({QUALITY_GATE_COVERAGE_BY_SUITE})) or on() vector(0))" overview_loc_clean_suites = f"(sum(({QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE}) == bool 0) or on() vector(0))" for panel_id, title, draw_expr, runtime_expr, y_pos in [ (40, "Pyrphoros UPS Current", ANANKE_UPS_DRAW_WATTS_DB, ANANKE_UPS_RUNTIME_DB, 7), (144, "Statera UPS Current", ANANKE_UPS_DRAW_WATTS_TETHYS, ANANKE_UPS_RUNTIME_TETHYS, 10), ]: panel = stat_panel( panel_id, title, None, {"h": 3, "w": 3, "x": 0, "y": y_pos}, unit="none", text_mode="name_and_value", targets=[ { "expr": overview_metric_pair_expr(draw_expr, "Draw", runtime_expr, "Runtime"), "refId": "A", "legendFormat": "{{metric}}", "instant": True, } ], field_overrides=[ {"matcher": {"id": "byName", "options": "Draw"}, "properties": [{"id": "unit", "value": "watt"}]}, {"matcher": {"id": "byName", "options": "Runtime"}, "properties": [{"id": "unit", "value": "s"}]}, ], links=overview_link("atlas-power"), ) panel["options"]["text"] = compact_current_text panels.append(panel) ups_history = timeseries_panel( 41, "UPS History (Power Draw)", None, {"h": 6, "w": 6, "x": 3, "y": 7}, unit="watt", targets=[ {"refId": "A", "expr": ANANKE_UPS_DRAW_WATTS_DB_SERIES, "legendFormat": ANANKE_UPS_DB_NAME}, {"refId": "B", "expr": ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES, "legendFormat": ANANKE_UPS_TETHYS_NAME}, ], field_overrides=fixed_color_overrides( {ANANKE_UPS_DB_NAME: dark_blue, ANANKE_UPS_TETHYS_NAME: dark_yellow} ), legend_display="list", legend_placement="bottom", links=overview_link("atlas-power"), ) ups_history["fieldConfig"]["defaults"]["custom"] = { "drawStyle": "line", "lineInterpolation": "linear", "lineWidth": 2, "fillOpacity": 18, "showPoints": "never", "spanNulls": True, } panels.append(ups_history) temp_panel = stat_panel( 42, "Current Enclosure Temperature", None, {"h": 3, 
"w": 3, "x": 0, "y": 13}, unit="none", text_mode="name_and_value", targets=[ { "expr": overview_metric_pair_expr( f"max({climate_temp_series}) or on() vector(0)", "°C", f"max(({climate_temp_series}) * 9 / 5 + 32) or on() vector(0)", "°F", ), "refId": "A", "legendFormat": "{{metric}}", "instant": True, } ], field_overrides=[ {"matcher": {"id": "byName", "options": "°C"}, "properties": [{"id": "unit", "value": "celsius"}]}, {"matcher": {"id": "byName", "options": "°F"}, "properties": [{"id": "unit", "value": "fahrenheit"}]}, ], links=overview_link("atlas-power"), ) temp_panel["options"]["text"] = compact_current_text panels.append(temp_panel) climate_panel = stat_panel( 143, "Current Enclosure Climate", None, {"h": 3, "w": 3, "x": 0, "y": 16}, unit="none", text_mode="name_and_value", targets=[ { "expr": overview_metric_pair_expr( f"max({climate_humidity_series}) or on() vector(0)", "%RH", f"max({climate_pressure_series}) or on() vector(0)", "kPa", ), "refId": "A", "legendFormat": "{{metric}}", "instant": True, } ], field_overrides=[ {"matcher": {"id": "byName", "options": "%RH"}, "properties": [{"id": "unit", "value": "suffix:%RH"}]}, {"matcher": {"id": "byName", "options": "kPa"}, "properties": [{"id": "unit", "value": "suffix:kPa"}]}, ], links=overview_link("atlas-power"), ) climate_panel["options"]["text"] = compact_current_text panels.append(climate_panel) panels.append( timeseries_panel( 43, "Enclosure Climate History", None, {"h": 6, "w": 6, "x": 3, "y": 13}, unit="none", targets=[ {"refId": "A", "expr": climate_temp_series, "legendFormat": "C"}, {"refId": "B", "expr": climate_humidity_series, "legendFormat": "RH"}, {"refId": "C", "expr": climate_pressure_series, "legendFormat": "P"}, {"refId": "D", "expr": f"(min_over_time({climate_temp_series}[$__range]) - 0.08)", "legendFormat": "C bound min"}, {"refId": "E", "expr": f"(max_over_time({climate_temp_series}[$__range]) + 0.08)", "legendFormat": "C bound max"}, {"refId": "F", "expr": 
f"clamp_min((min_over_time({climate_humidity_series}[$__range]) - 0.35), 0)", "legendFormat": "RH bound min"}, {"refId": "G", "expr": f"clamp_max((max_over_time({climate_humidity_series}[$__range]) + 0.35), 100)", "legendFormat": "RH bound max"}, {"refId": "H", "expr": f"clamp_min((min_over_time({climate_pressure_series}[$__range]) - 0.03), 0)", "legendFormat": "P bound min"}, {"refId": "I", "expr": f"(max_over_time({climate_pressure_series}[$__range]) + 0.03)", "legendFormat": "P bound max"}, ], field_overrides=[ { "matcher": {"id": "byName", "options": "C"}, "properties": [ {"id": "unit", "value": "suffix:°C"}, {"id": "decimals", "value": 2}, {"id": "custom.axisPlacement", "value": "left"}, {"id": "custom.axisCenteredZero", "value": False}, ], }, { "matcher": {"id": "byRegexp", "options": "C bound .*"}, "properties": [ {"id": "unit", "value": "suffix:°C"}, {"id": "custom.axisPlacement", "value": "left"}, {"id": "custom.axisCenteredZero", "value": False}, {"id": "custom.hideFrom", "value": {"legend": True, "tooltip": True, "viz": False}}, {"id": "custom.lineWidth", "value": 0}, {"id": "custom.fillOpacity", "value": 0}, {"id": "custom.showPoints", "value": "never"}, {"id": "color", "value": {"mode": "fixed", "fixedColor": "transparent"}}, ], }, { "matcher": {"id": "byName", "options": "RH"}, "properties": [ {"id": "unit", "value": "suffix:%"}, {"id": "decimals", "value": 2}, {"id": "custom.axisPlacement", "value": "right"}, {"id": "custom.axisCenteredZero", "value": False}, ], }, { "matcher": {"id": "byRegexp", "options": "RH bound .*"}, "properties": [ {"id": "unit", "value": "suffix:%"}, {"id": "custom.axisPlacement", "value": "right"}, {"id": "custom.axisCenteredZero", "value": False}, {"id": "custom.hideFrom", "value": {"legend": True, "tooltip": True, "viz": False}}, {"id": "custom.lineWidth", "value": 0}, {"id": "custom.fillOpacity", "value": 0}, {"id": "custom.showPoints", "value": "never"}, {"id": "color", "value": {"mode": "fixed", "fixedColor": 
"transparent"}}, ], }, { "matcher": {"id": "byName", "options": "P"}, "properties": [ {"id": "unit", "value": "suffix:kPa"}, {"id": "custom.axisPlacement", "value": "right"}, {"id": "decimals", "value": 2}, {"id": "custom.axisCenteredZero", "value": False}, ], }, { "matcher": {"id": "byRegexp", "options": "P bound .*"}, "properties": [ {"id": "unit", "value": "suffix:kPa"}, {"id": "custom.axisPlacement", "value": "right"}, {"id": "custom.axisCenteredZero", "value": False}, {"id": "custom.hideFrom", "value": {"legend": True, "tooltip": True, "viz": False}}, {"id": "custom.lineWidth", "value": 0}, {"id": "custom.fillOpacity", "value": 0}, {"id": "custom.showPoints", "value": "never"}, {"id": "color", "value": {"mode": "fixed", "fixedColor": "transparent"}}, ], }, ], legend_display="list", legend_placement="bottom", links=overview_link("atlas-power"), description="Temperature on left axis, humidity and pressure on right axis with dynamic bound series so small swings remain visible.", ) ) panels[-1]["fieldConfig"]["defaults"]["custom"] = { "drawStyle": "line", "lineInterpolation": "linear", "lineWidth": 2, "fillOpacity": 10, "showPoints": "never", "spanNulls": True, } fan_panel = state_timeline_panel( 141, "Fan Intensity History", fan_intensity_expr, {"h": 6, "w": 6, "x": 9, "y": 13}, unit="none", min_value=0, max_value=10, legend="{{fan}}", thresholds=fan_intensity_thresholds, links=overview_link("atlas-power"), description="Fan intensity lanes on the 0-10 controller scale. 
Cooler colors are quiet/low intensity; warmer colors mean the enclosure is pushing harder.", ) fan_panel["options"]["legend"] = {"displayMode": "list", "placement": "bottom"} fan_panel["options"]["mergeValues"] = False fan_panel["options"]["showValue"] = "auto" fan_panel["options"]["tooltip"] = {"mode": "multi", "sort": "none"} panels.append(fan_panel) flux_source = stat_panel( 140, "Flux Source", None, {"h": 2, "w": 3, "x": 21, "y": 7}, unit="none", text_mode="name", thresholds={ "mode": "absolute", "steps": [ {"color": dark_red, "value": None}, {"color": dark_blue, "value": 1}, ], }, targets=[ { "expr": f"{GITOPS_SOURCE_INFO} or on() vector(0)", "refId": "A", "legendFormat": "{{branch}}", "instant": True, } ], links=overview_link("atlas-gitops"), description="Flux GitRepository branch reported by Ananke. Revision and object detail live in Atlas GitOps.", ) flux_source["options"]["graphMode"] = "none" flux_source["options"]["text"] = {"titleSize": 10, "valueSize": 14} panels.append(flux_source) for panel_id, title, expr, y_pos, unit, decimals, thresholds, links in [ (151, "Run Reliability (24h)", TEST_SUCCESS_RATE_24H, 9, "percent", 1, test_success_thresholds, "atlas-testing"), (152, "Failed Runs (24h)", TEST_FAILURES_24H_TOTAL, 11, "none", 0, failure_count_thresholds, "atlas-testing"), (153, "Fresh Suites (24h)", PLATFORM_TEST_ACTIVE_SUITES_24H, 13, "none", 0, perfect_count_thresholds, "atlas-testing"), (154, "Avg Coverage", overview_avg_coverage, 15, "percent", 1, test_success_thresholds, "atlas-testing"), (155, "LOC Clean Suites", overview_loc_clean_suites, 17, "none", 0, perfect_count_thresholds, "atlas-testing"), ]: rail_panel = stat_panel( panel_id, title, expr, {"h": 2, "w": 3, "x": 21, "y": y_pos}, unit=unit, decimals=decimals, instant=True, thresholds=thresholds, links=overview_link(links), ) rail_panel["options"]["graphMode"] = "none" rail_panel["options"]["text"] = {"titleSize": 10, "valueSize": 19} panels.append(rail_panel) panels.append( 
state_timeline_panel( 150, "GitOps Health", gitops_health_history_expr, {"h": 6, "w": 6, "x": 15, "y": 7}, unit="percent", min_value=0, max_value=100, legend="{{signal}}", thresholds=test_success_thresholds, links=overview_link("atlas-gitops"), description="GitOps readiness and suspension health over time. Blue means perfect; warmer colors mean a readiness or suspension problem appeared.", ) ) panels.append( bargauge_panel( 44, "One-off Job Pods (age hours)", ONEOFF_JOB_POD_AGE_HOURS, {"h": 5, "w": 8, "x": 0, "y": 32}, unit="h", instant=True, legend="{{namespace}}/{{pod}}", thresholds=age_thresholds, limit=12, decimals=2, links=overview_link("atlas-testing"), include_color=False, ) ) ariadne_volume = timeseries_panel( 45, "Ariadne Run Volume", None, {"h": 6, "w": 6, "x": 9, "y": 7}, unit="none", targets=[ {"expr": f"{ARIADNE_TASK_ATTEMPTS_SERIES} or on() vector(0)", "refId": "A", "legendFormat": "Attempts"}, {"expr": f"{ARIADNE_TASK_FAILURES_SERIES} or on() vector(0)", "refId": "B", "legendFormat": "Failures"}, ], legend_display="list", legend_placement="bottom", links=overview_link("atlas-testing"), ) ariadne_volume["fieldConfig"]["overrides"] = fixed_color_overrides( {"Attempts": dark_blue, "Failures": dark_red} ) panels.append(apply_bar_timeseries_style(ariadne_volume, stacked=False)) panels.append( state_timeline_panel( 46, "Gate Checks Passing by Suite", PLATFORM_TEST_CURRENT_GATE_HEALTH_BY_SUITE, {"h": 6, "w": 6, "x": 15, "y": 13}, unit="percent", min_value=0, max_value=100, legend="{{suite}}", thresholds=test_success_thresholds, links=overview_link("atlas-testing"), description="Percent of current gate dimensions passing per suite over time. 
There are seven gate dimensions, so 85.7% means one gate is failing.", ) ) panels[-1]["options"]["legend"] = {"displayMode": "hidden", "placement": "bottom"} panels[-1]["options"]["mergeValues"] = False panels[-1]["options"]["showValue"] = "auto" for panel_id, title, metric, x_pos, description in [ ( 142, "Jenkins Last Success (h, newest first)", "ariadne_jenkins_build_weather_job_last_success_timestamp_seconds", 8, "Top 6 most recent Jenkins successes by age (newest first). Green means last run succeeded; red means last run did not succeed. Use Atlas Jobs for the full list.", ), ( 243, "Jenkins Last Failure (h, newest first)", "ariadne_jenkins_build_weather_job_last_failure_timestamp_seconds", 12, "Top 6 most recent Jenkins failures by age (newest first). Green means last run succeeded; red means last run did not succeed. Use Atlas Jobs for the full list.", ), ]: base_expr = f"min by (exported_job,job_url,weather_icon) ((time() - {metric}) / 3600)" topk_expr = f"sort(bottomk(6, {base_expr}))" success_expr = ( f'label_replace(({topk_expr}) and on(exported_job,job_url,weather_icon) ' '(max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) == 1), ' '"run_state", "ok", "exported_job", ".*")' ) failure_expr = ( f'label_replace(({topk_expr}) and on(exported_job,job_url,weather_icon) ' '(max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) != 1), ' '"run_state", "bad", "exported_job", ".*")' ) panels.append( { "id": panel_id, "type": "stat", "title": title, "datasource": PROM_DS, "gridPos": {"h": 5, "w": 4, "x": x_pos, "y": 32}, "targets": [ { "refId": "A", "expr": f"sort(({success_expr}) or ({failure_expr}))", "instant": True, } ], "fieldConfig": { "defaults": { "unit": "h", "decimals": 1, "min": 0, "displayName": "${__field.labels.weather_icon} ${__field.labels.exported_job}", "links": [ { "title": "Open Jenkins job", "url": "https://ci.bstein.dev/job/${__field.labels.exported_job}/", 
"targetBlank": True, } ], }, "overrides": [ { "matcher": {"id": "byRegexp", "options": '.*run_state="ok".*'}, "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}], }, { "matcher": {"id": "byRegexp", "options": '.*run_state="bad".*'}, "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}], }, ], }, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "left", "orientation": "horizontal", "wideLayout": True, "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, "textMode": "name_and_value", "text": {"titleSize": 11, "valueSize": 11}, }, "transformations": [{"id": "sortBy", "options": {"fields": ["Value"], "order": "asc"}}], "links": overview_link("atlas-testing"), "description": description, } ) panels.append( bargauge_panel( 47, "PVC Backup Health / Age", overview_pvc_backup_age, {"h": 5, "w": 8, "x": 16, "y": 32}, unit="h", instant=True, legend="{{namespace}}/{{pvc}}", sort_order="desc", thresholds={ "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 20}, {"color": "orange", "value": 40}, {"color": "red", "value": 50}, ], }, include_color=False, ) ) panels[-1]["links"] = overview_link("atlas-storage") panels[-1]["description"] = ( "Backup age in hours computed from last-success timestamps for restic-managed PVCs (nightly target: <=20h green, <40h yellow, <50h orange, >=50h red). PVCs that have backup history but currently no successful backup (missing/no_completed/error) are pinned to 999h for visibility." 
) panels.append( stat_panel( 30, "Mail Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', {"h": 2, "w": 4, "x": 0, "y": 19}, unit="none", links=overview_link("atlas-mail"), ) ) panels.append( { "id": 31, "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, "gridPos": {"h": 2, "w": 4, "x": 8, "y": 19}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', "refId": "A", "legendFormat": "Rate", }, { "expr": 'max(postmark_outbound_bounced{window="1d"})', "refId": "B", "legendFormat": "Count", }, ], "fieldConfig": { "defaults": { "color": {"mode": "thresholds"}, "custom": {"displayMode": "auto"}, "thresholds": mail_bounce_rate_thresholds, "unit": "none", }, "overrides": [ { "matcher": {"id": "byName", "options": "Rate"}, "properties": [{"id": "unit", "value": "percent"}], }, { "matcher": {"id": "byName", "options": "Count"}, "properties": [{"id": "unit", "value": "none"}], }, ], }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, "textMode": "name_and_value", }, "links": overview_link("atlas-mail"), } ) panels.append( stat_panel( 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', {"h": 2, "w": 4, "x": 4, "y": 19}, unit="percent", thresholds=mail_success_thresholds, decimals=1, links=overview_link("atlas-mail"), ) ) panels.append( stat_panel( 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", {"h": 2, "w": 4, "x": 12, "y": 19}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, links=overview_link("atlas-mail"), ) ) panels.append( stat_panel( 34, "Postgres Connections Used", POSTGRES_CONN_USED, {"h": 2, "w": 4, "x": 16, "y": 19}, decimals=0, text_mode="name_and_value", legend="{{conn}}", instant=True, ) ) panels.append( stat_panel( 35, "Postgres Hottest Connections", POSTGRES_CONN_HOTTEST, {"h": 2, "w": 4, "x": 20, "y": 19}, unit="none", 
decimals=0, text_mode="name_and_value", legend="{{datname}}", instant=True, ) ) cpu_scope = "$namespace_scope_cpu" gpu_scope = "$namespace_scope_gpu" ram_scope = "$namespace_scope_ram" panels.append( pie_panel( 11, "Namespace CPU Share", namespace_cpu_share_expr(cpu_scope), {"h": 9, "w": 8, "x": 0, "y": 23}, links=namespace_scope_links("namespace_scope_cpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) ) panels.append( pie_panel( 12, "Namespace GPU Share", namespace_gpu_share_expr(gpu_scope), {"h": 9, "w": 8, "x": 8, "y": 23}, links=namespace_scope_links("namespace_scope_gpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) ) panels.append( pie_panel( 13, "Namespace RAM Share", namespace_ram_share_expr(ram_scope), {"h": 9, "w": 8, "x": 16, "y": 23}, links=namespace_scope_links("namespace_scope_ram"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) ) worker_filter = f"{WORKER_REGEX}" panels.append( timeseries_panel( 14, "Worker Node CPU", node_cpu_expr(worker_filter), {"h": 12, "w": 12, "x": 0, "y": 44}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", links=overview_link("atlas-nodes"), ) ) panels.append( timeseries_panel( 15, "Worker Node RAM", node_mem_expr(worker_filter), {"h": 12, "w": 12, "x": 12, "y": 44}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", links=overview_link("atlas-nodes"), ) ) panels.append( timeseries_panel( 16, "Control plane CPU", node_cpu_expr(CONTROL_ALL_REGEX), {"h": 10, "w": 12, "x": 0, "y": 56}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", ) ) panels.append( timeseries_panel( 17, "Control plane RAM", node_mem_expr(CONTROL_ALL_REGEX), {"h": 10, "w": 12, "x": 12, "y": 56}, unit="percent", 
legend="{{node}}", legend_display="table", legend_placement="right", ) ) panels.append( pie_panel( 28, "Node Pod Share", '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100', {"h": 10, "w": 12, "x": 0, "y": 66}, ) ) panels.append( bargauge_panel( 29, "Top Nodes by Pod Count", 'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))', {"h": 10, "w": 12, "x": 12, "y": 66}, unit="none", limit=12, decimals=0, thresholds={ "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 50}, {"color": "orange", "value": 75}, {"color": "red", "value": 100}, ], }, instant=True, include_color=False, ) ) panels.append( timeseries_panel( 18, "Cluster Ingress Throughput", NET_INGRESS_EXPR, {"h": 7, "w": 8, "x": 0, "y": 37}, unit="Bps", legend="Ingress (Traefik)", legend_display="list", legend_placement="bottom", links=overview_link("atlas-network"), ) ) panels.append( timeseries_panel( 19, "Cluster Egress Throughput", NET_EGRESS_EXPR, {"h": 7, "w": 8, "x": 8, "y": 37}, unit="Bps", legend="Egress (Traefik)", legend_display="list", legend_placement="bottom", links=overview_link("atlas-network"), ) ) panels.append( timeseries_panel( 20, "Intra-Cluster Throughput", NET_INTERNAL_EXPR, {"h": 7, "w": 8, "x": 16, "y": 37}, unit="Bps", legend="Internal traffic", legend_display="list", legend_placement="bottom", links=overview_link("atlas-network"), ) ) panels.append( timeseries_panel( 21, "Root Filesystem Usage", root_usage_expr(), {"h": 16, "w": 12, "x": 0, "y": 76}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", time_from="30d", links=overview_link("atlas-storage"), ) ) panels.append( timeseries_panel( 22, "Nodes Closest to Full Astraios Disks", astraios_usage_expr(), {"h": 16, "w": 12, "x": 12, "y": 76}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", time_from="1w", 
links=overview_link("atlas-storage"), ) ) return { "uid": "atlas-overview", "title": "Atlas Overview", "folderUid": PUBLIC_FOLDER, "editable": False, "annotations": {"list": []}, "panels": panels, "schemaVersion": 39, "style": "dark", "tags": ["atlas", "overview"], "templating": { "list": [ namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"), namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"), namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"), ] }, "time": {"from": "now-1h", "to": "now"}, "refresh": "1m", "links": link_to("atlas-testing"), } def build_pods_dashboard(): panels = [] panels.append( stat_panel( 1, "Problem Pods", PROBLEM_PODS_EXPR, {"h": 4, "w": 6, "x": 0, "y": 0}, thresholds={ "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "red", "value": 1}, ], }, ) ) panels.append( stat_panel( 2, "CrashLoop / ImagePull", CRASHLOOP_EXPR, {"h": 4, "w": 6, "x": 6, "y": 0}, thresholds={ "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "red", "value": 1}, ], }, ) ) panels.append( stat_panel( 3, "Stuck Terminating (>10m)", STUCK_TERMINATING_EXPR, {"h": 4, "w": 6, "x": 12, "y": 0}, thresholds={ "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "red", "value": 1}, ], }, ) ) panels.append( stat_panel( 4, "Control Plane Workloads", f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})', {"h": 4, "w": 6, "x": 18, "y": 0}, thresholds={ "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "red", "value": 1}, ], }, ) ) panels.append( table_panel( 5, "Pods Not Running", PROBLEM_TABLE_EXPR, {"h": 10, "w": 24, "x": 0, "y": 4}, unit="s", transformations=[{"id": "labelsToFields", "options": {}}], ) ) panels.append( table_panel( 6, "CrashLoop / ImagePull", CRASHLOOP_TABLE_EXPR, {"h": 10, "w": 24, "x": 0, "y": 14}, unit="s", transformations=[{"id": "labelsToFields", "options": {}}], ) ) 
panels.append( table_panel( 7, "Terminating >10m", STUCK_TABLE_EXPR, {"h": 10, "w": 24, "x": 0, "y": 24}, unit="s", transformations=[ {"id": "labelsToFields", "options": {}}, {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}}, ], ) ) panels.append( pie_panel( 8, "Node Pod Share", '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100', {"h": 8, "w": 12, "x": 12, "y": 34}, ) ) panels.append( bargauge_panel( 9, "Top Nodes by Pod Count", 'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))', {"h": 8, "w": 12, "x": 0, "y": 34}, unit="none", limit=12, decimals=0, thresholds={ "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 50}, {"color": "orange", "value": 75}, {"color": "red", "value": 100}, ], }, instant=True, ) ) share_expr = ( '(sum by (namespace,node) (kube_pod_info{pod!="" , node!=""}) ' '/ on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=""}), 1) * 100)' ) rank_terms = [ f"(sum by (node) (kube_node_info{{node=\"{node}\"}}) * 0 + {idx * 1e-3})" for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1) ] rank_expr = " or ".join(rank_terms) score_expr = f"{share_expr} + on(node) group_left() ({rank_expr})" mask_expr = ( f"{score_expr} == bool on(namespace) group_left() " f"(max by (namespace) ({score_expr}))" ) panels.append( table_panel( 10, "Namespace Plurality by Node v27", ( f"{share_expr} * on(namespace,node) group_left() " f"({mask_expr})" ), {"h": 8, "w": 24, "x": 0, "y": 42}, unit="percent", transformations=[ {"id": "labelsToFields", "options": {}}, {"id": "organize", "options": {"excludeByName": {"Time": True}}}, {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 0}}, { "id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}, }, { "id": "groupBy", "options": { "fields": { "namespace": { "aggregations": [ {"field": "Value", "operation": 
"max"}, {"field": "node", "operation": "first"}, ] } }, "rowBy": ["namespace"], }, }, ], instant=True, options={"showColumnFilters": False}, filterable=False, footer={"show": False, "fields": "", "calcs": []}, format="table", ) ) return { "uid": "atlas-pods", "title": "Atlas Pods", "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, "time": {"from": "now-12h", "to": "now"}, "annotations": {"list": []}, "schemaVersion": 39, "style": "dark", "tags": ["atlas", "pods"], } def build_nodes_dashboard(): panels = [] panels.append( stat_panel( 1, "Worker Nodes Ready", f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', {"h": 4, "w": 8, "x": 0, "y": 0}, value_suffix=WORKER_SUFFIX, ) ) panels.append( stat_panel( 2, "Control Plane Ready", f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})', {"h": 4, "w": 8, "x": 8, "y": 0}, value_suffix=CONTROL_SUFFIX, ) ) panels.append( stat_panel( 3, "Control Plane Workloads", f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})', {"h": 4, "w": 8, "x": 16, "y": 0}, ) ) panels.append( stat_panel( 9, "API Server 5xx rate", APISERVER_5XX_RATE, {"h": 4, "w": 8, "x": 0, "y": 4}, unit="req/s", thresholds={ "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 0.05}, {"color": "orange", "value": 0.2}, {"color": "red", "value": 0.5}, ], }, decimals=3, ) ) panels.append( stat_panel( 10, "API Server P99 latency", APISERVER_P99_LATENCY_MS, {"h": 4, "w": 8, "x": 8, "y": 4}, unit="ms", thresholds={ "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 250}, {"color": "orange", "value": 400}, {"color": "red", "value": 600}, ], }, decimals=1, ) ) panels.append( stat_panel( 11, "etcd P99 latency", ETCD_P99_LATENCY_MS, {"h": 4, "w": 8, "x": 16, "y": 4}, unit="ms", thresholds={ "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": 
"yellow", "value": 50}, {"color": "orange", "value": 100}, {"color": "red", "value": 200}, ], }, decimals=1, ) ) panels.append( timeseries_panel( 4, "Node CPU", node_cpu_expr(), {"h": 9, "w": 24, "x": 0, "y": 8}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", ) ) panels.append( timeseries_panel( 5, "Node RAM", node_mem_expr(), {"h": 9, "w": 24, "x": 0, "y": 17}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", ) ) panels.append( timeseries_panel( 6, "Control Plane (incl. titan-db) CPU", node_cpu_expr(CONTROL_ALL_REGEX), {"h": 9, "w": 12, "x": 0, "y": 26}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", ) ) panels.append( timeseries_panel( 7, "Control Plane (incl. titan-db) RAM", node_mem_expr(CONTROL_ALL_REGEX), {"h": 9, "w": 12, "x": 12, "y": 26}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", ) ) panels.append( timeseries_panel( 8, "Root Filesystem Usage", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 35}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="30d", ) ) panels.append( timeseries_panel( 9, "Astraios Usage", astraios_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 44}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="30d", ) ) return { "uid": "atlas-nodes", "title": "Atlas Nodes", "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, "time": {"from": "now-12h", "to": "now"}, "annotations": {"list": []}, "schemaVersion": 39, "style": "dark", "tags": ["atlas", "nodes"], } def build_storage_dashboard(): panels = [] panels.append( stat_panel( 1, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 0, "y": 0}, unit="percent", thresholds=PERCENT_THRESHOLDS, ) ) panels.append( stat_panel( 2, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), 
{"h": 5, "w": 6, "x": 6, "y": 0}, unit="percent", thresholds=PERCENT_THRESHOLDS, ) ) panels.append( stat_panel( 3, "Astreae Free", astreae_free_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="decbytes", ) ) panels.append( stat_panel( 4, "Asteria Free", astreae_free_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="decbytes", ) ) panels.append( timeseries_panel( 5, "Astreae Per-Node Usage", filesystem_usage_expr("/mnt/astreae", LONGHORN_NODE_REGEX), {"h": 9, "w": 12, "x": 0, "y": 5}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="30d", ) ) panels.append( timeseries_panel( 6, "Asteria Per-Node Usage", filesystem_usage_expr("/mnt/asteria", LONGHORN_NODE_REGEX), {"h": 9, "w": 12, "x": 12, "y": 5}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="30d", ) ) panels.append( timeseries_panel( 7, "Astreae Usage History", astreae_usage_expr("/mnt/astreae"), {"h": 9, "w": 12, "x": 0, "y": 14}, unit="percent", time_from="90d", ) ) panels.append( timeseries_panel( 8, "Asteria Usage History", astreae_usage_expr("/mnt/asteria"), {"h": 9, "w": 12, "x": 12, "y": 14}, unit="percent", time_from="90d", ) ) panels.append( stat_panel( 30, "Maintenance Sweepers Ready", 'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 100', {"h": 4, "w": 12, "x": 0, "y": 44}, unit="percent", thresholds=PERCENT_THRESHOLDS, ) ) panels.append( stat_panel( 31, "Maintenance Cron Freshness (s)", 'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"})', {"h": 4, "w": 12, "x": 12, "y": 44}, unit="s", thresholds={ "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 3600}, {"color": "red", "value": 10800}, ], }, ) ) 
return { "uid": "atlas-storage", "title": "Atlas Storage", "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, "time": {"from": "now-12h", "to": "now"}, "annotations": {"list": []}, "schemaVersion": 39, "style": "dark", "tags": ["atlas", "storage"], } def build_network_dashboard(): panels = [] panels.append( stat_panel( 1, "Ingress Success Rate (5m)", TRAEFIK_SLI_5M, {"h": 4, "w": 6, "x": 0, "y": 0}, unit="percentunit", decimals=2, thresholds={ "mode": "absolute", "steps": [ {"color": "red", "value": None}, {"color": "orange", "value": 0.995}, {"color": "yellow", "value": 0.999}, {"color": "green", "value": 0.9995}, ], }, ) ) panels.append( stat_panel( 2, "Error Budget Burn (1h)", traefik_burn("1h"), {"h": 4, "w": 6, "x": 6, "y": 0}, thresholds={ "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 1}, {"color": "orange", "value": 2}, {"color": "red", "value": 4}, ], }, decimals=2, ) ) panels.append( stat_panel( 3, "Error Budget Burn (6h)", traefik_burn("6h"), {"h": 4, "w": 6, "x": 12, "y": 0}, thresholds={ "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 1}, {"color": "orange", "value": 2}, {"color": "red", "value": 4}, ], }, decimals=2, ) ) panels.append( stat_panel( 4, "Edge P99 Latency (ms)", TRAEFIK_P99_LATENCY_MS, {"h": 4, "w": 6, "x": 18, "y": 0}, unit="ms", thresholds={ "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 200}, {"color": "orange", "value": 350}, {"color": "red", "value": 500}, ], }, decimals=1, ) ) panels.append( stat_panel( 5, "Ingress Traffic", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 4}, unit="Bps", ) ) panels.append( stat_panel( 6, "Egress Traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 4}, unit="Bps", ) ) panels.append( stat_panel( 7, "Intra-Cluster Traffic", NET_INTERNAL_EXPR, {"h": 4, "w": 8, "x": 16, "y": 4}, unit="Bps", ) ) panels.append( timeseries_panel( 8, "Per-Node 
Throughput", f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})', {"h": 8, "w": 24, "x": 0, "y": 8}, unit="Bps", legend="{{node}}", legend_display="table", legend_placement="right", ) ) panels.append( table_panel( 9, "Top Namespaces", 'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))', {"h": 9, "w": 12, "x": 0, "y": 16}, unit="Bps", transformations=[{"id": "labelsToFields", "options": {}}], ) ) panels.append( table_panel( 10, "Top Pods", 'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) ' '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))', {"h": 9, "w": 12, "x": 12, "y": 16}, unit="Bps", transformations=[{"id": "labelsToFields", "options": {}}], ) ) panels.append( timeseries_panel( 11, "Traefik Routers (req/s)", f"topk(10, {TRAEFIK_ROUTER_EXPR})", {"h": 9, "w": 12, "x": 0, "y": 25}, unit="req/s", legend="{{router}}", legend_display="table", legend_placement="right", ) ) panels.append( timeseries_panel( 12, "Traefik Entrypoints (req/s)", 'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))', {"h": 9, "w": 12, "x": 12, "y": 25}, unit="req/s", legend="{{entrypoint}}", legend_display="table", legend_placement="right", ) ) return { "uid": "atlas-network", "title": "Atlas Network", "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, "time": {"from": "now-12h", "to": "now"}, "annotations": {"list": []}, "schemaVersion": 39, "style": "dark", "tags": ["atlas", "network"], } def build_mail_dashboard(): panels = [] bounce_rate_thresholds = { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 5}, {"color": "orange", "value": 8}, {"color": "red", "value": 10}, ], } limit_thresholds = { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 70}, {"color": 
"orange", "value": 85}, {"color": "red", "value": 95}, ], } success_thresholds = { "mode": "absolute", "steps": [ {"color": "red", "value": None}, {"color": "orange", "value": 90}, {"color": "yellow", "value": 95}, {"color": "green", "value": 98}, ], } panels.append( stat_panel( 1, "Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', {"h": 4, "w": 6, "x": 0, "y": 0}, decimals=0, ) ) panels.append( stat_panel( 2, "Sent (7d)", 'max(postmark_outbound_sent{window="7d"})', {"h": 4, "w": 6, "x": 6, "y": 0}, decimals=0, ) ) panels.append( { "id": 3, "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, "gridPos": {"h": 4, "w": 6, "x": 12, "y": 0}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', "refId": "A", "legendFormat": "Rate", }, { "expr": 'max(postmark_outbound_bounced{window="1d"})', "refId": "B", "legendFormat": "Count", }, ], "fieldConfig": { "defaults": { "color": {"mode": "thresholds"}, "custom": {"displayMode": "auto"}, "thresholds": bounce_rate_thresholds, "unit": "none", }, "overrides": [ { "matcher": {"id": "byName", "options": "Rate"}, "properties": [{"id": "unit", "value": "percent"}], }, { "matcher": {"id": "byName", "options": "Count"}, "properties": [{"id": "unit", "value": "none"}], }, ], }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, "textMode": "name_and_value", }, } ) panels.append( stat_panel( 4, "Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', {"h": 4, "w": 6, "x": 18, "y": 0}, unit="percent", thresholds=success_thresholds, decimals=1, ) ) panels.append( stat_panel( 5, "Limit Used (30d)", "max(postmark_sending_limit_used_percent)", {"h": 4, "w": 6, "x": 0, "y": 4}, thresholds=limit_thresholds, unit="percent", decimals=1, ) ) panels.append( stat_panel( 6, "Send Limit (30d)", "max(postmark_sending_limit)", {"h": 4, "w": 6, "x": 6, "y": 4}, decimals=0, 
) ) panels.append( stat_panel( 7, "Last Success", "max(postmark_last_success_timestamp_seconds)", {"h": 4, "w": 6, "x": 12, "y": 4}, unit="dateTimeAsIso", decimals=0, ) ) panels.append( stat_panel( 8, "Exporter Errors", "sum(postmark_request_errors_total)", {"h": 4, "w": 6, "x": 18, "y": 4}, decimals=0, ) ) panels.append( timeseries_panel( 13, "Bounce Rate (1d vs 7d)", "max by (window) (postmark_outbound_bounce_rate)", {"h": 8, "w": 12, "x": 0, "y": 12}, unit="percent", legend="{{window}}", legend_display="table", legend_placement="right", ) ) panels.append( timeseries_panel( 14, "Bounced (1d vs 7d)", "max by (window) (postmark_outbound_bounced)", {"h": 8, "w": 12, "x": 12, "y": 12}, unit="none", legend="{{window}}", legend_display="table", legend_placement="right", ) ) panels.append( timeseries_panel( 15, "Sent (1d vs 7d)", "max by (window) (postmark_outbound_sent)", {"h": 8, "w": 12, "x": 0, "y": 20}, unit="none", legend="{{window}}", legend_display="table", legend_placement="right", ) ) panels.append( timeseries_panel( 16, "Exporter Errors", "sum(postmark_request_errors_total)", {"h": 8, "w": 12, "x": 12, "y": 20}, unit="none", ) ) return { "uid": "atlas-mail", "title": "Atlas Mail", "folderUid": PRIVATE_FOLDER, "editable": True, "panels": panels, "time": {"from": "now-30d", "to": "now"}, "annotations": {"list": []}, "schemaVersion": 39, "style": "dark", "tags": ["atlas", "mail"], } def build_jobs_dashboard(): panels = [] suite_var = "${suite:regex}" test_var = "${test:regex}" branch_var = "${branch:regex}" success = PLATFORM_TEST_SUCCESS_STATUS exported = PLATFORM_TEST_EXPORT_FILTER runs_selector = f'suite=~"{suite_var}",{exported}' runs_success_selector = f'{runs_selector},status=~"{success}"' runs_failure_selector = f'{runs_selector},status!~"{success}"' checks_selector = f'__name__=~".*_quality_gate_checks_total",suite=~"{suite_var}",{exported}' coverage_metric_selector = f'__name__=~".*_quality_gate_coverage_percent",suite=~"{suite_var}",{exported}' 
    # --- PromQL selector fragments scoped by the dashboard template variables ---
    # Workspace coverage / LOC metrics are filtered only by suite + export filter.
    workspace_coverage_selector = f'suite=~"{suite_var}",{exported}'
    smell_selector = f'suite=~"{suite_var}",{exported}'
    # build_info additionally requires a non-empty branch label matching the branch variable.
    build_info_selector = f'suite=~"{suite_var}",branch!="",branch=~"{branch_var}",{exported}'
    # Always-true (value 1) series per currently-selected suite, derived from build_info.
    selected_suite_universe = (
        f'(count by (suite) (platform_quality_gate_build_info{{{build_info_selector}}}) >= bool 0)'
    )
    # Zero-valued series per selected suite; used as an `or` fallback so suites
    # with no matching data still render a 0 instead of disappearing.
    selected_suite_zero = f"(0 * {selected_suite_universe})"
    # Synthetic value-1 series for every known suite name (independent of scraped data).
    suite_universe = " or ".join(
        f'label_replace(vector(1), "suite", "{suite}", "__name__", ".*")'
        for suite in PLATFORM_TEST_SUITE_NAMES
    )
    # Run counters over 24h/30d windows; `or on() vector(0)` keeps empty results at 0.
    runs_24h = f'(sum(increase(platform_quality_gate_runs_total{{{runs_selector}}}[24h])) or on() vector(0))'
    runs_30d = f'(sum(increase(platform_quality_gate_runs_total{{{runs_selector}}}[30d])) or on() vector(0))'
    success_24h = (
        f'(sum(increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[24h])) or on() vector(0))'
    )
    success_30d = (
        f'(sum(increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[30d])) or on() vector(0))'
    )
    failures_24h = (
        f'(sum(increase(platform_quality_gate_runs_total{{{runs_failure_selector}}}[24h])) or on() vector(0))'
    )
    # Success percentages; clamp_min(..., 1) avoids division by zero when there are no runs.
    success_rate_24h = f"100 * ({success_24h}) / clamp_min(({runs_24h}), 1)"
    success_rate_30d = f"100 * ({success_30d}) / clamp_min(({runs_30d}), 1)"
    runs_by_suite_24h = f'sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[24h]))'
    success_by_suite_24h = (
        f'sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[24h]))'
    )
    # Per-suite success rate, restricted to suites with >0 runs; suites without
    # runs are emitted as -1 (the panel maps -1 to a "no runs" label).
    success_rate_by_suite_24h = (
        f'sort_desc(((100 * ({success_by_suite_24h}) / clamp_min(({runs_by_suite_24h}), 1)) '
        f'and on(suite) (({runs_by_suite_24h}) > 0)) '
        f'or on(suite) ((0 * ({runs_by_suite_24h})) - 1))'
    )
    non_failure = PLATFORM_TEST_NON_FAILURE_STATUS
    # Percent of distinct checks per suite whose result label is non-failure;
    # falls back to 0 for selected suites with no check evidence.
    current_gate_health_by_suite = (
        f'(100 * sum by (suite) (max by (suite, check) (({{{checks_selector},result=~"{non_failure}"}} > bool 0))) '
        f'/ clamp_min(sum by (suite) (max by (suite, check) (({{{checks_selector}}} > bool 0))), 1)) '
        f'or on(suite) ({selected_suite_zero})'
    )
    success_history_runs = f'sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[7d]))'
    # 7d rolling success rate, only defined where the suite actually had runs in the window.
    success_history_by_suite = (
        f'(100 * sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[7d])) '
        f'/ ({success_history_runs})) and on(suite) (({success_history_runs}) > 0)'
    )
    daily_success_volume = (
        f'sum(increase(platform_quality_gate_runs_total{{{runs_success_selector}}}[24h])) or on() vector(0)'
    )
    daily_failure_volume = (
        f'sum(increase(platform_quality_gate_runs_total{{{runs_failure_selector}}}[24h])) or on() vector(0)'
    )
    # Coverage: prefer the suite-specific *_quality_gate_coverage_percent metric and
    # fall back to the workspace line-coverage metric (history prefers the reverse order).
    coverage_by_suite = (
        f'(max by (suite) ({{{coverage_metric_selector}}})) '
        f'or on(suite) (max by (suite) (platform_quality_gate_workspace_line_coverage_percent{{{workspace_coverage_selector}}}))'
    )
    coverage_history_by_suite = (
        f'(max by (suite) (platform_quality_gate_workspace_line_coverage_percent{{{workspace_coverage_selector}}})) '
        f'or on(suite) (max by (suite) ({{{coverage_metric_selector}}}))'
    )
    # -1 marks suites that ran in the last 30d but report no coverage (panel maps -1 to "missing").
    coverage_with_missing = (
        f"({coverage_by_suite}) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[30d])))) - 1)"
    )
    # LOC-limit telemetry: files over 500 lines vs. total gated files per suite.
    smell_by_suite = f'max by (suite) (platform_quality_gate_source_lines_over_500_total{{{smell_selector}}})'
    loc_files_by_suite = f'max by (suite) (platform_quality_gate_source_files_total{{{smell_selector}}})'
    smell_with_missing = (
        f"({smell_by_suite}) or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[30d])))) - 1)"
    )
    # Percent of gated files at or under the limit, defined only where a file count exists.
    loc_limit_compliance_by_suite = (
        f"(100 * clamp_min(({loc_files_by_suite}) - ({smell_by_suite}), 0) / ({loc_files_by_suite})) "
        f"and on(suite) (({loc_files_by_suite}) > 0)"
    )
    # Fallbacks: a 100%/0% estimate from the over-limit count alone (older payloads
    # without a file-count metric), then -1 for suites with 30d runs but no LOC data.
    loc_limit_compliance_with_missing = (
        f"({loc_limit_compliance_by_suite}) "
        f"or on(suite) (100 * (1 - clamp_max(({smell_by_suite}), 1))) "
        f"or on(suite) ((0 * (sum by (suite) (increase(platform_quality_gate_runs_total{{{runs_selector}}}[30d])))) - 1)"
    )
    # History variant of LOC compliance: same expression minus the -1 "missing"
    # fallback so timeline panels simply show gaps instead of a sentinel value.
    loc_limit_compliance_history = (
        f"({loc_limit_compliance_by_suite}) "
        f"or on(suite) (100 * (1 - clamp_max(({smell_by_suite}), 1)))"
    )
    average_coverage = f"(avg(({coverage_by_suite})) or on() vector(0))"
    # Count of suites currently reporting at least one file over the 500-line limit.
    suites_loc_violating = f'(sum((({smell_by_suite}) > bool 0)) or on() vector(0))'
    # Regexes grouping individual gate check names into the dashboard's check families.
    check_regex_tests = "tests|unit|build"
    check_regex_coverage = "coverage"
    check_regex_loc = "loc|smell"
    check_regex_style = "docs|naming|hygiene|lint|docs_naming|style"
    check_regex_gate_glue = "gate|glue|gate_glue"
    check_regex_sonarqube = "sonarqube|sonar"
    check_regex_supply_chain = "ironbank|supply_chain|image_compliance|artifact_security"

    def _check_state_percent_series(regex: str, failed: bool) -> str:
        """Return a PromQL expression for the percentage of checks in the family
        matched by *regex* that are in the failed (failed=True) or non-failure
        (failed=False) state, per suite, with a 0 fallback for selected suites
        that have no check evidence."""
        state = f'result!~"{non_failure}"' if failed else f'result=~"{non_failure}"'
        state_checks = (
            f'sum by (suite) (max by (suite, check) (({{{checks_selector},check=~"{regex}",{state}}} > bool 0)))'
        )
        total_checks = (
            f'sum by (suite) (max by (suite, check) (({{{checks_selector},check=~"{regex}"}} > bool 0)))'
        )
        state_percent = f"(100 * ({state_checks}) / clamp_min(({total_checks}), 1))"
        return f"(({state_percent}) or on(suite) ({selected_suite_zero}))"

    # Failed test cases: prefer the memoized hourly rollup series, falling back to
    # the raw per-test metric; blank test labels and the "__no_test_cases__"
    # placeholder rows are excluded in both.
    rollup_failed_tests = (
        f'sum by (suite, test) (platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}})'
    )
    raw_failed_tests = (
        f'sum by (suite, test) (max_over_time(platform_quality_gate_test_case_result{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",{exported},status="failed"}}[$__interval]))'
    )
    # Top 12 failing (suite, test) pairs for the history timeline panel.
    problematic_tests_history_core = f"topk(12, (({rollup_failed_tests}) or on(suite, test) ({raw_failed_tests})))"
    problematic_tests_history = problematic_tests_history_core
    # 30-day failure totals via a subquery over the hourly rollup.
    rollup_failed_tests_30d = (
        f'sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}}[30d:1h]))'
    )
raw_failed_tests_30d = ( f'sum by (suite, test) (increase(platform_quality_gate_test_case_result{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",{exported},status="failed"}}[30d]))' ) worst_test_per_suite_core = ( f"topk by (suite) (1, (({rollup_failed_tests_30d}) or on(suite, test) ({raw_failed_tests_30d})))" ) worst_test_per_suite = worst_test_per_suite_core def _selected_status_volume(status: str) -> str: return ( f'(sum(platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",' f'branch=~"{branch_var}",test!="",test=~"{test_var}",test!="__no_test_cases__",' f'status="{status}"}}) or on() vector(0))' ) selected_test_pass_fail = [ { "refId": "A", "expr": _selected_status_volume("passed"), "legendFormat": "Passed", }, { "refId": "B", "expr": _selected_status_volume("failed"), "legendFormat": "Failed", }, { "refId": "C", "expr": _selected_status_volume("skipped"), "legendFormat": "Skipped", }, ] selected_test_pass_rate = ( f'avg by (suite) (platform_quality:test_case_pass_rate:percent_1h{{suite=~"{suite_var}",' f'branch!="",branch=~"{branch_var}",test!="",test=~"{test_var}",test!="__no_test_cases__"}})' ) recent_branch_evidence = ( f'sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[30d])))' ) non_primary_branch_evidence = ( f'count by (suite) (max_over_time(platform_quality_gate_build_info{{{build_info_selector},branch!~"main|master|origin/main|origin/master|unknown"}}[30d]))' ) branch_evidence_by_suite = ( f'count by (suite) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[30d]))' ) primary_branch_clean_by_suite = ( f'sort_desc((100 * ((({branch_evidence_by_suite}) > bool 0) ' f'unless on(suite) (({non_primary_branch_evidence}) > bool 0))) ' f'or on(suite) (0 * (({branch_evidence_by_suite}) > bool 0)))' ) def _missing_suite_series(presence_expr: str) -> str: missing = f"(({suite_universe}) unless on(suite) 
{presence_expr})" return f"({missing}) or on(suite) (0 * ({suite_universe}))" def _present_suite_percent(presence_expr: str) -> str: present = f"(({suite_universe}) and on(suite) {presence_expr})" return f"(100 * ({present})) or on(suite) (0 * ({suite_universe}))" present_tests_by_suite = _present_suite_percent( f'count by (suite) ({{__name__=~".*_quality_gate_tests_total",{exported}}})' ) present_checks_by_suite = _present_suite_percent( f'count by (suite) ({{__name__=~".*_quality_gate_checks_total",{exported}}})' ) present_coverage_by_suite = _present_suite_percent( f"count by (suite) (platform_quality_gate_workspace_line_coverage_percent{{{exported}}})" ) present_loc_by_suite = _present_suite_percent( f"count by (suite) (platform_quality_gate_source_lines_over_500_total{{{exported}}}) " f"and on(suite) count by (suite) (platform_quality_gate_source_files_total{{{exported}}})" ) present_test_case_by_suite = _present_suite_percent( f"count by (suite) (platform_quality_gate_test_case_result{{{exported}}})" ) real_test_case_by_suite = _present_suite_percent( f'count by (suite) (platform_quality_gate_test_case_result{{{exported},test!="__no_test_cases__"}})' ) dark_red = "dark-red" dark_orange = "dark-orange" dark_yellow = "dark-yellow" dark_green = "dark-green" dark_blue = "dark-blue" success_thresholds = { "mode": "absolute", "steps": [ {"color": dark_red, "value": None}, {"color": dark_orange, "value": 90}, {"color": dark_yellow, "value": 93}, {"color": dark_green, "value": 95}, {"color": dark_blue, "value": 100}, ], } coverage_thresholds = success_thresholds failures_thresholds = { "mode": "absolute", "steps": [ {"color": dark_blue, "value": None}, {"color": dark_green, "value": 0.01}, {"color": dark_yellow, "value": 1}, {"color": dark_orange, "value": 3}, {"color": dark_red, "value": 5}, ], } smell_thresholds = { "mode": "absolute", "steps": [ {"color": dark_red, "value": None}, {"color": dark_green, "value": 0}, {"color": dark_yellow, "value": 1}, {"color": 
dark_orange, "value": 3}, {"color": dark_red, "value": 5}, ], } missing_thresholds = { "mode": "absolute", "steps": [ {"color": dark_green, "value": None}, {"color": dark_red, "value": 1}, ], } panels.append( stat_panel( 2, "Run Reliability (24h)", success_rate_24h, {"h": 5, "w": 4, "x": 0, "y": 0}, unit="percent", decimals=2, instant=True, thresholds=success_thresholds, ) ) panels.append( stat_panel( 3, "Run Reliability (30d)", success_rate_30d, {"h": 5, "w": 4, "x": 4, "y": 0}, unit="percent", decimals=2, instant=True, thresholds=success_thresholds, ) ) panels.append( stat_panel( 4, "Failed Runs (24h)", failures_24h, {"h": 5, "w": 4, "x": 8, "y": 0}, unit="none", instant=True, thresholds=failures_thresholds, ) ) panels.append( stat_panel( 5, "Runs (24h)", runs_24h, {"h": 5, "w": 4, "x": 12, "y": 0}, unit="none", instant=True, thresholds={ "mode": "absolute", "steps": [{"color": dark_red, "value": None}, {"color": dark_green, "value": 1}], }, ) ) panels.append( stat_panel( 6, "Avg Coverage (%)", average_coverage, {"h": 5, "w": 4, "x": 16, "y": 0}, unit="percent", decimals=2, instant=True, thresholds=success_thresholds, ) ) panels.append( stat_panel( 7, "Suites with LOC >500", suites_loc_violating, {"h": 5, "w": 4, "x": 20, "y": 0}, unit="none", instant=True, thresholds=smell_thresholds, ) ) panels.append( bargauge_panel( 8, "Current Gate Health by Suite", current_gate_health_by_suite, {"h": 8, "w": 8, "x": 0, "y": 5}, unit="percent", instant=True, legend="{{suite}}", sort_order="asc", thresholds=success_thresholds, decimals=2, ) ) panels[-1]["description"] = ( "Current pass percentage across the required gate dimensions reported by each suite. " "This is the fastest place to answer whether the latest suite quality signal is healthy." 
) reliability_suite_panel = bargauge_panel( 9, "Run Reliability by Suite (24h)", success_rate_by_suite_24h, {"h": 8, "w": 8, "x": 8, "y": 5}, unit="percent", instant=True, legend="{{suite}}", sort_order="asc", thresholds=success_thresholds, decimals=2, ) reliability_suite_panel["description"] = ( "Rolling CI run success rate. This can stay low after failed/debug runs even when " "Current Gate Health is green." ) reliability_suite_panel["fieldConfig"]["defaults"]["mappings"] = [ {"type": "value", "options": {"-1": {"text": "no runs"}}} ] panels.append(reliability_suite_panel) history_panel = state_timeline_panel( 11, "Run Reliability by Suite (7d rolling)", success_history_by_suite, {"h": 8, "w": 24, "x": 0, "y": 13}, thresholds=success_thresholds, description=( "Seven-day rolling run success rate per suite. Each suite gets its own lane, " "so brief failed/debug runs lower the lane color without creating unreadable 0/100 spikes." ), ) panels.append(history_panel) run_volume_panel = timeseries_panel( 12, "Daily Run Volume (Selected Scope)", None, {"h": 8, "w": 8, "x": 0, "y": 21}, unit="none", targets=[ {"refId": "A", "expr": daily_success_volume, "legendFormat": "Success"}, {"refId": "B", "expr": daily_failure_volume, "legendFormat": "Failure"}, ], legend_display="list", legend_placement="bottom", legend_calcs=[], ) run_volume_panel["description"] = ( "Twenty-four-hour rolling run counts for the selected suite/branch scope. " "This is volume, not a pass-rate percentage." ) run_volume_panel["fieldConfig"]["defaults"]["min"] = 0 run_volume_panel["fieldConfig"]["defaults"]["custom"] = { "drawStyle": "bars", "barAlignment": 0, "lineWidth": 0, "fillOpacity": 70, "stacking": {"mode": "normal", "group": "A"}, } panels.append(run_volume_panel) panels.append( state_timeline_panel( 13, "Coverage History by Suite", coverage_history_by_suite, {"h": 8, "w": 8, "x": 8, "y": 21}, thresholds=coverage_thresholds, description=( "Latest reported line coverage per suite over time. 
Coverage is separate " "from LOC compliance so one signal cannot hide the other." ), ) ) panels.append( state_timeline_panel( 14, "Files <=500 LOC History by Suite", loc_limit_compliance_history, {"h": 8, "w": 8, "x": 16, "y": 21}, thresholds=success_thresholds, description=( "Percent of LOC-gated source files at or under the 500-line limit. " "This uses the existing file-count telemetry; longest-file history needs a new publisher metric." ), ) ) check_dimensions = [ ("Tests", check_regex_tests), ("Coverage", check_regex_coverage), ("LOC", check_regex_loc), ("Style", check_regex_style), ("Gate Glue", check_regex_gate_glue), ("SonarQube", check_regex_sonarqube), ("Supply Chain", check_regex_supply_chain), ] def _append_check_trends(start_id: int, title_prefix: str, failed: bool, y: int) -> None: trend_thresholds = failures_thresholds if failed else success_thresholds trend_description = ( "Current bad-state percentage for this check family, evaluated over time. " "Higher means more of the selected suites/checks are failing right now; this is not an event-count spike chart." if failed else "Current acceptable-state percentage for this check family, evaluated over time. " "Higher means more of the selected suites/checks are healthy right now; gaps mean there was no check evidence." 
) for index, (label, regex) in enumerate(check_dimensions[:4]): panel = state_timeline_panel( start_id + index, f"{label} {title_prefix}", _check_state_percent_series(regex, failed), {"h": 7, "w": 6, "x": index * 6, "y": y}, thresholds=trend_thresholds, description=trend_description, ) panels.append(panel) for index, (label, regex) in enumerate(check_dimensions[4:]): panel = state_timeline_panel( start_id + 4 + index, f"{label} {title_prefix}", _check_state_percent_series(regex, failed), {"h": 7, "w": 8, "x": index * 8, "y": y + 7}, thresholds=trend_thresholds, description=trend_description, ) panels.append(panel) _append_check_trends(130, "Failure Rate", True, 29) _append_check_trends(138, "Healthy Rate", False, 43) panels.append( state_timeline_panel( 145, "Problematic Tests Over Time (Top failures)", problematic_tests_history, {"h": 8, "w": 12, "x": 0, "y": 57}, thresholds=failures_thresholds, unit="none", min_value=0, max_value=None, legend="{{suite}} - {{test}}", description=( "Top failing test cases over time, using memoized hourly rollups. " "Blank branch/test labels and placeholder no-test-case rows are excluded." 
), ) ) panels[-1]["links"] = jenkins_suite_links() panels[-1]["fieldConfig"]["defaults"]["links"] = jenkins_latest_artifact_data_links() panels.append( bargauge_panel( 147, "Most Problematic Test by Suite (30d)", worst_test_per_suite, {"h": 8, "w": 12, "x": 12, "y": 57}, unit="none", instant=True, legend="{{suite}} · {{test}}", sort_order="desc", thresholds=failures_thresholds, limit=9, links=jenkins_suite_links(), data_links=jenkins_latest_artifact_data_links(), ) ) panels.append( timeseries_panel( 146, "Selected Test Pass/Fail History", None, {"h": 8, "w": 12, "x": 0, "y": 65}, unit="none", targets=selected_test_pass_fail, legend_display="list", legend_placement="bottom", legend_calcs=[], links=jenkins_suite_links(), data_links=jenkins_artifact_data_links(), ) ) panels[-1]["description"] = ( "Stacked hourly outcome volume for the selected suite/branch/test scope. " "This uses vmalert rollups only, avoiding expensive raw 30-day per-test scans." ) panels[-1]["fieldConfig"]["defaults"]["min"] = 0 panels[-1]["fieldConfig"]["defaults"]["custom"] = { "drawStyle": "bars", "barAlignment": 0, "lineWidth": 0, "fillOpacity": 70, "stacking": {"mode": "normal", "group": "A"}, } selected_pass_rate_panel = state_timeline_panel( 152, "Selected Test Pass Rate History", selected_test_pass_rate, {"h": 8, "w": 12, "x": 12, "y": 65}, thresholds=success_thresholds, legend="{{suite}}", description=( "Average pass rate per suite for the selected test filter, using memoized hourly " "test-case pass-rate rollups instead of raw historical scans." 
), ) selected_pass_rate_panel["links"] = jenkins_suite_links() selected_pass_rate_panel["fieldConfig"]["defaults"]["links"] = jenkins_artifact_data_links() panels.append(selected_pass_rate_panel) coverage_panel = bargauge_panel( 17, "Coverage by Suite (Latest, gate 95)", coverage_with_missing, {"h": 8, "w": 12, "x": 0, "y": 73}, unit="percent", instant=True, legend="{{suite}}", sort_order="asc", thresholds=coverage_thresholds, decimals=2, ) coverage_panel["fieldConfig"]["defaults"]["mappings"] = [ {"type": "value", "options": {"-1": {"text": "missing"}}} ] panels.append(coverage_panel) smell_panel = bargauge_panel( 18, "Files <=500 LOC by Suite (Latest)", loc_limit_compliance_with_missing, {"h": 8, "w": 12, "x": 12, "y": 73}, unit="percent", instant=True, legend="{{suite}}", sort_order="asc", thresholds=success_thresholds, decimals=0, ) smell_panel["fieldConfig"]["defaults"]["mappings"] = [ {"type": "value", "options": {"-1": {"text": "missing"}}} ] smell_panel["description"] = "Percent of managed LOC-gated files at or under 500 lines. Older suite payloads fall back to 100%/0% until they emit platform_quality_gate_source_files_total." 
    panels.append(smell_panel)
    # Telemetry-completeness row: each gauge shows 100% when the suite exports the
    # metric family and 0% when it is absent (see _present_suite_percent).
    panels.append(
        bargauge_panel(
            27,
            "Tests Metrics Present by Suite",
            present_tests_by_suite,
            {"h": 7, "w": 6, "x": 0, "y": 81},
            unit="percent",
            instant=True,
            legend="{{suite}}",
            sort_order="desc",
            thresholds=success_thresholds,
            decimals=0,
        )
    )
    panels.append(
        bargauge_panel(
            28,
            "Checks Metrics Present by Suite",
            present_checks_by_suite,
            {"h": 7, "w": 6, "x": 6, "y": 81},
            unit="percent",
            instant=True,
            legend="{{suite}}",
            sort_order="desc",
            thresholds=success_thresholds,
            decimals=0,
        )
    )
    panels.append(
        bargauge_panel(
            29,
            "Coverage Metrics Present by Suite",
            present_coverage_by_suite,
            {"h": 7, "w": 6, "x": 12, "y": 81},
            unit="percent",
            instant=True,
            legend="{{suite}}",
            sort_order="desc",
            thresholds=success_thresholds,
            decimals=0,
        )
    )
    panels.append(
        bargauge_panel(
            30,
            "LOC Compliance Metrics Present by Suite",
            present_loc_by_suite,
            {"h": 7, "w": 6, "x": 18, "y": 81},
            unit="percent",
            instant=True,
            legend="{{suite}}",
            sort_order="desc",
            thresholds=success_thresholds,
            decimals=0,
        )
    )
    # SonarQube exporter health row.
    panels.append(
        stat_panel(
            31,
            "SonarQube API Up",
            "(max(sonarqube_up) or on() vector(0))",
            {"h": 6, "w": 4, "x": 0, "y": 88},
            unit="none",
            instant=True,
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": dark_red, "value": None},
                    {"color": dark_green, "value": 1},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            32,
            "Sonar Projects (Selected)",
            f'(count(max by (project_key) (sonarqube_project_quality_gate_pass{{project_key=~"{suite_var}"}})) or on() vector(0))',
            {"h": 6, "w": 4, "x": 4, "y": 88},
            unit="none",
            instant=True,
            thresholds=failures_thresholds,
        )
    )
    panels.append(
        stat_panel(
            33,
            "Sonar Gate Fetch Errors",
            "(max(sonarqube_quality_gate_fetch_errors_total) or on() vector(0))",
            {"h": 6, "w": 4, "x": 8, "y": 88},
            unit="none",
            instant=True,
            thresholds=failures_thresholds,
        )
    )
    sonar_status_mix_panel = pie_panel(
        34,
        "Sonar Gate Status Mix (Selected)",
        f'count by (status) (max by (project_key, status) (sonarqube_project_quality_gate_pass{{project_key=~"{suite_var}"}}))',
        {"h": 6, "w": 4, "x": 12, "y": 88},
    )
    # Label pie slices by the Sonar gate status label instead of the series name.
    sonar_status_mix_panel["targets"][0]["legendFormat"] = "{{status}}"
    panels.append(sonar_status_mix_panel)
    panels.append(
        state_timeline_panel(
            35,
            "Sonar Gate Health by Project",
            f'100 * max by (project_key) (sonarqube_project_quality_gate_pass{{project_key=~"{suite_var}"}})',
            {"h": 6, "w": 8, "x": 16, "y": 88},
            thresholds=success_thresholds,
            unit="percent",
            min_value=0,
            max_value=100,
            legend="{{project_key}}",
            description=(
                "SonarQube gate status over time by project. OK projects render as full healthy lanes; "
                "non-OK projects drop to red without disappearing."
            ),
        )
    )
    panels.append(
        bargauge_panel(
            148,
            "Test-Case Metrics Present by Suite",
            present_test_case_by_suite,
            {"h": 6, "w": 12, "x": 0, "y": 94},
            unit="percent",
            instant=True,
            legend="{{suite}}",
            sort_order="desc",
            thresholds=success_thresholds,
            decimals=0,
        )
    )
    panels.append(
        bargauge_panel(
            151,
            "Real Test Cases Present by Suite",
            real_test_case_by_suite,
            {"h": 6, "w": 12, "x": 12, "y": 94},
            unit="percent",
            instant=True,
            legend="{{suite}}",
            sort_order="desc",
            thresholds=success_thresholds,
            decimals=0,
        )
    )
    panels.append(
        bargauge_panel(
            149,
            "Recent Branch Evidence by Suite (30d)",
            recent_branch_evidence,
            {"h": 7, "w": 12, "x": 0, "y": 100},
            unit="none",
            instant=True,
            legend="{{suite}} · {{branch}}",
            sort_order="desc",
            thresholds=missing_thresholds,
            decimals=0,
            links=jenkins_suite_links(),
        )
    )
    panels.append(
        bargauge_panel(
            150,
            "Primary Branch Clean by Suite (30d)",
            primary_branch_clean_by_suite,
            {"h": 7, "w": 12, "x": 12, "y": 100},
            unit="percent",
            instant=True,
            legend="{{suite}}",
            sort_order="desc",
            thresholds=success_thresholds,
            decimals=0,
            links=jenkins_suite_links(),
        )
    )
    # Keep the first paint intentionally light. The detailed matrices remain
    # available, but they stay collapsed so browsers do not render every series
    # and legend before the operator asks for them.
panel_by_id = {panel["id"]: panel for panel in panels} visible_layout = { 2: {"h": 4, "w": 4, "x": 0, "y": 0}, 3: {"h": 4, "w": 4, "x": 4, "y": 0}, 4: {"h": 4, "w": 4, "x": 8, "y": 0}, 5: {"h": 4, "w": 4, "x": 12, "y": 0}, 6: {"h": 4, "w": 4, "x": 16, "y": 0}, 7: {"h": 4, "w": 4, "x": 20, "y": 0}, 8: {"h": 7, "w": 12, "x": 0, "y": 4}, 9: {"h": 7, "w": 12, "x": 12, "y": 4}, 17: {"h": 7, "w": 12, "x": 0, "y": 11}, 18: {"h": 7, "w": 12, "x": 12, "y": 11}, } compact_panels = [] for panel_id, grid in visible_layout.items(): panel = panel_by_id[panel_id] panel["gridPos"] = grid compact_panels.append(panel) def children(ids): return [panel_by_id[panel_id] for panel_id in ids] row_layout = { 11: {"h": 8, "w": 12, "x": 0, "y": 19}, 12: {"h": 8, "w": 12, "x": 12, "y": 19}, 13: {"h": 8, "w": 12, "x": 0, "y": 27}, 14: {"h": 8, "w": 12, "x": 12, "y": 27}, 145: {"h": 8, "w": 24, "x": 0, "y": 74}, 147: {"h": 8, "w": 8, "x": 0, "y": 83}, 146: {"h": 8, "w": 8, "x": 8, "y": 83}, 152: {"h": 8, "w": 8, "x": 16, "y": 83}, 27: {"h": 7, "w": 6, "x": 0, "y": 94}, 28: {"h": 7, "w": 6, "x": 6, "y": 94}, 29: {"h": 7, "w": 6, "x": 12, "y": 94}, 30: {"h": 7, "w": 6, "x": 18, "y": 94}, 148: {"h": 7, "w": 6, "x": 0, "y": 101}, 151: {"h": 7, "w": 6, "x": 6, "y": 101}, 149: {"h": 7, "w": 6, "x": 12, "y": 101}, 150: {"h": 7, "w": 6, "x": 18, "y": 101}, 31: {"h": 6, "w": 4, "x": 0, "y": 111}, 32: {"h": 6, "w": 4, "x": 4, "y": 111}, 33: {"h": 6, "w": 4, "x": 8, "y": 111}, 34: {"h": 6, "w": 4, "x": 12, "y": 111}, 35: {"h": 6, "w": 8, "x": 16, "y": 111}, } for panel_id, grid in row_layout.items(): panel_by_id[panel_id]["gridPos"] = grid compact_panels.extend( [ row_panel(500, "Reliability And Run History", 18, panels=children([11, 12, 13, 14])), row_panel( 501, "Check Failure Rates By Suite", 19, panels=children([130, 131, 132, 133, 134, 135, 136]), ), row_panel( 502, "Check Healthy Rates By Suite", 20, panels=children([138, 139, 140, 141, 142, 143, 144]), ), row_panel( 503, "Test Drilldowns And 
Problem Tests", 21, panels=children([145, 147, 146, 152]), ),
            # Collapsed row: telemetry-presence and branch-evidence gauges.
            row_panel(
                504,
                "Telemetry Completeness And Branches",
                22,
                panels=children([27, 28, 29, 30, 148, 151, 149, 150]),
            ),
            # Collapsed row: SonarQube exporter and project-health panels.
            row_panel(
                505,
                "SonarQube Project Health",
                23,
                panels=children([31, 32, 33, 34, 35]),
            ),
        ]
    )
    panels = compact_panels
    # With rows collapsed, render the bar gauges in the lighter "basic" mode.
    set_bargauge_display_mode(panels, "basic")
    return {
        "uid": "atlas-jobs",
        "title": "Atlas Testing",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-30d", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "testing", "quality-gate", "ci"],
        "templating": {
            "list": [
                testing_suite_variable(),
                testing_branch_variable(),
                testing_case_variable(),
                jenkins_base_variable(),
            ]
        },
    }


def build_testing_dashboard():
    """Public, read-only copy of the jobs dashboard in the public folder."""
    dashboard = build_jobs_dashboard()
    dashboard["uid"] = "atlas-testing"
    dashboard["folderUid"] = PUBLIC_DASHBOARD_FOLDER
    dashboard["editable"] = False
    return dashboard


def build_gitops_dashboard():
    """Build the private Atlas GitOps dashboard: Flux source/Kustomization/
    HelmRelease readiness stats, readiness history, and detail tables."""
    gitops_success_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "red", "value": None},
            {"color": "yellow", "value": 99},
            {"color": "blue", "value": 100},
        ],
    }
    # Table override: color the Ready (0/1) "Value" column red/blue.
    gitops_value_overrides = [
        {
            "matcher": {"id": "byName", "options": "Value"},
            "properties": [
                {
                    "id": "thresholds",
                    "value": {
                        "mode": "absolute",
                        "steps": [
                            {"color": "red", "value": None},
                            {"color": "blue", "value": 1},
                        ],
                    },
                }
            ],
        }
    ]
    # Join the info metric (metadata labels) with the ready metric (0/1 value)
    # so each table row carries both metadata and readiness.
    kustomization_table = (
        f"max by (namespace, name, path, source_namespace, source_name, revision, ready, reason) "
        f"(ananke_gitops_kustomization_info{{{GITOPS_SELECTOR}}}) "
        f"* on(namespace, name) group_left() max by (namespace, name) "
        f"(ananke_gitops_kustomization_ready{{{GITOPS_SELECTOR}}})"
    )
    helm_table = (
        f"max by (namespace, name, chart, version, app_version, revision, ready, reason) "
        f"(ananke_gitops_helmrelease_info{{{GITOPS_SELECTOR}}}) "
        f"* on(namespace, name) group_left() max by (namespace, name) "
        f"(ananke_gitops_helmrelease_ready{{{GITOPS_SELECTOR}}})"
    )
    source_table = (
        f"max by (namespace, name, url, branch, revision, ready, reason) "
        f"(ananke_gitops_flux_source_info{{{GITOPS_SELECTOR}}}) "
        f"* on(namespace, name) group_left() max by (namespace, name) "
        f"(ananke_gitops_flux_source_ready{{{GITOPS_SELECTOR}}})"
    )
    panels = [
        stat_panel(
            1,
            "Flux Source",
            f"{GITOPS_SOURCE_INFO} or on() vector(0)",
            {"h": 4, "w": 8, "x": 0, "y": 0},
            text_mode="name",
            targets=[{"expr": f"{GITOPS_SOURCE_INFO} or on() vector(0)", "refId": "A", "legendFormat": "{{branch}} · {{revision}}", "instant": True}],
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "blue", "value": 1},
                ],
            },
            description="Branch and revision currently reported by Flux's GitRepository source.",
        ),
        stat_panel(
            2,
            "Kustomizations Ready",
            GITOPS_KUSTOMIZATION_READY_PCT,
            {"h": 4, "w": 4, "x": 8, "y": 0},
            unit="percent",
            decimals=1,
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "yellow", "value": 99},
                    {"color": "blue", "value": 100},
                ],
            },
        ),
        stat_panel(
            3,
            "Kustomizations Suspended",
            GITOPS_KUSTOMIZATION_SUSPENDED,
            {"h": 4, "w": 4, "x": 12, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "blue", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        ),
        stat_panel(
            4,
            "HelmReleases Ready",
            GITOPS_HELM_READY_PCT,
            {"h": 4, "w": 4, "x": 16, "y": 0},
            unit="percent",
            decimals=1,
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "yellow", "value": 99},
                    {"color": "blue", "value": 100},
                ],
            },
        ),
        stat_panel(
            5,
            "HelmReleases Suspended",
            GITOPS_HELM_SUSPENDED,
            {"h": 4, "w": 4, "x": 20, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "blue", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        ),
        stat_panel(
            6,
            "GitOps Exporter",
            None,
            {"h": 4, "w": 8, "x": 0, "y": 4},
            text_mode="name_and_value",
            targets=[
                {"expr": GITOPS_SCRAPE_SUCCESS, "refId": "A", "legendFormat": "Scrape Success", "instant": True},
                {"expr": GITOPS_LAST_SCRAPE_AGE, "refId": "B", "legendFormat": "Sample Age", "instant": True},
            ],
            field_overrides=[
                {"matcher": {"id": "byName", "options": "Sample Age"}, "properties": [{"id": "unit", "value": "s"}]},
                {
                    "matcher": {"id": "byName", "options": "Scrape Success"},
                    "properties": [
                        {
                            "id": "thresholds",
                            "value": {
                                "mode": "absolute",
                                "steps": [
                                    {"color": "red", "value": None},
                                    {"color": "blue", "value": 1},
                                ],
                            },
                        }
                    ],
                },
            ],
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "blue", "value": 1},
                ],
            },
        ),
        state_timeline_panel(
            7,
            "Readiness History",
            (
                f'label_replace({GITOPS_KUSTOMIZATION_READY_PCT}, "kind", "Kustomizations", "__name__", ".*") '
                f'or label_replace({GITOPS_HELM_READY_PCT}, "kind", "HelmReleases", "__name__", ".*")'
            ),
            {"h": 4, "w": 16, "x": 8, "y": 4},
            thresholds=gitops_success_thresholds,
            legend="{{kind}}",
            description="Ready percentage over time for Flux Kustomizations and HelmReleases.",
        ),
        table_panel(
            8,
            "Flux Sources",
            source_table,
            {"h": 8, "w": 24, "x": 0, "y": 8},
            instant=True,
            format="table",
            transformations=[{"id": "labelsToFields", "options": {}}],
            field_overrides=gitops_value_overrides,
            description="A Value of 1 means Ready; 0 means not Ready.",
        ),
        table_panel(
            9,
            "Kustomizations",
            kustomization_table,
            {"h": 12, "w": 24, "x": 0, "y": 16},
            instant=True,
            format="table",
            transformations=[{"id": "labelsToFields", "options": {}}],
            field_overrides=gitops_value_overrides,
            description="A Value of 1 means Ready; 0 means not Ready. The ready/reason labels come from Flux status.conditions.",
        ),
        table_panel(
            10,
            "HelmReleases",
            helm_table,
            {"h": 12, "w": 24, "x": 0, "y": 28},
            instant=True,
            format="table",
            transformations=[{"id": "labelsToFields", "options": {}}],
            field_overrides=gitops_value_overrides,
            description="A Value of 1 means Ready; 0 means not Ready. Chart/version/app_version are included when Flux reports them.",
        ),
    ]
    return {
        "uid": "atlas-gitops",
        "title": "Atlas GitOps",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "gitops", "flux"],
    }


def build_power_dashboard():
    """Build the Atlas power/environment dashboard (UPS load and history,
    tent climate, fan activity)."""
    panels = []
    # Map the on-battery metric value (0/1) to human-readable status text.
    status_mapping = [
        {
            "type": "value",
            "options": {
                "0": {"text": "⚡ Charging"},
                "1": {"text": "🔋 Discharging"},
            },
        }
    ]
    panels.append(
        stat_panel(
            1,
            "UPS Current Load",
            None,
            {"h": 8, "w": 12, "x": 0, "y": 0},
            unit="none",
            decimals=1,
            text_mode="name_and_value",
            targets=[
                {"refId": "A", "expr": ANANKE_UPS_DRAW_WATTS_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Draw (W)", "instant": True},
                {"refId": "B", "expr": ANANKE_UPS_RUNTIME_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Discharge", "instant": True},
                {"refId": "C", "expr": ANANKE_UPS_ON_BATTERY_DB, "legendFormat": f"{ANANKE_UPS_DB_NAME} Status", "instant": True},
                {"refId": "D", "expr": ANANKE_UPS_DRAW_WATTS_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Draw (W)", "instant": True},
                {"refId": "E", "expr": ANANKE_UPS_RUNTIME_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Discharge", "instant": True},
                {"refId": "F", "expr": ANANKE_UPS_ON_BATTERY_TETHYS, "legendFormat": f"{ANANKE_UPS_TETHYS_NAME} Status", "instant": True},
            ],
            field_overrides=[
                {
                    "matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Draw (W)"},
                    "properties": [{"id": "unit", "value": "watt"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_DB_NODE}"}],
                },
                {
                    "matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Draw (W)"},
                    "properties": [{"id": "unit", "value": "watt"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}],
                },
                {
                    "matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Discharge"},
                    "properties": [{"id": "unit", "value": "s"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_DB_NODE}"}],
                },
{ "matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Discharge"}, "properties": [{"id": "unit", "value": "s"}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}], }, { "matcher": {"id": "byName", "options": f"{ANANKE_UPS_DB_NAME} Status"}, "properties": [{"id": "mappings", "value": status_mapping}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_DB_NODE}"}], }, { "matcher": {"id": "byName", "options": f"{ANANKE_UPS_TETHYS_NAME} Status"}, "properties": [{"id": "mappings", "value": status_mapping}, {"id": "description", "value": f"Attached node: {ANANKE_UPS_TETHYS_NODE}"}], }, ], orientation="horizontal", wide_layout=True, description=( "Per-UPS live snapshot: current draw in watts, estimated battery runtime if discharge started now, and charging/discharging status." ), ) ) panels.append( apply_bar_timeseries_style( timeseries_panel( 2, "UPS History (Power Draw)", None, {"h": 8, "w": 12, "x": 12, "y": 0}, unit="watt", targets=[ {"refId": "A", "expr": ANANKE_UPS_DRAW_WATTS_DB_SERIES, "legendFormat": ANANKE_UPS_DB_NAME}, {"refId": "B", "expr": ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES, "legendFormat": ANANKE_UPS_TETHYS_NAME}, ], field_overrides=fixed_color_overrides( {ANANKE_UPS_DB_NAME: "dark-blue", ANANKE_UPS_TETHYS_NAME: "dark-yellow"} ), legend_display="table", legend_placement="right", description="Historical UPS power consumption in watts for titan-db and tethys.", ), stacked=False, ) ) panels.append( stat_panel( 3, "Current Climate", None, {"h": 8, "w": 12, "x": 0, "y": 8}, unit="none", decimals=2, text_mode="name_and_value", targets=[ {"refId": "A", "expr": CLIMATE_TEMP_MAX, "legendFormat": "Tent Temp (°C)", "instant": True}, {"refId": "B", "expr": CLIMATE_PRESSURE_CURRENT, "legendFormat": "Tent VPD (kPa)", "instant": True}, {"refId": "C", "expr": CLIMATE_HUMIDITY_MAX, "legendFormat": "Tent RH (%)", "instant": True}, {"refId": "D", "expr": CLIMATE_DEWPOINT_CURRENT, "legendFormat": "Dew Point (°C)", "instant": True}, 
], field_overrides=[ {"matcher": {"id": "byName", "options": "Tent Temp (°C)"}, "properties": [{"id": "unit", "value": "celsius"}]}, {"matcher": {"id": "byName", "options": "Tent VPD (kPa)"}, "properties": [{"id": "unit", "value": "suffix:kPa"}]}, {"matcher": {"id": "byName", "options": "Tent RH (%)"}, "properties": [{"id": "unit", "value": "percent"}]}, {"matcher": {"id": "byName", "options": "Dew Point (°C)"}, "properties": [{"id": "unit", "value": "celsius"}]}, ], orientation="horizontal", wide_layout=True, description="Current tent temperature, humidity, VPD, and dew point. These render once Typhon climate telemetry is online.", ) ) panels.append( timeseries_panel( 4, "Climate History", None, {"h": 8, "w": 12, "x": 12, "y": 8}, unit="celsius", targets=[ {"refId": "A", "expr": CLIMATE_TEMP_SERIES, "legendFormat": "Temperature (°C)"}, {"refId": "B", "expr": CLIMATE_HUMIDITY_SERIES, "legendFormat": "Humidity (%)"}, {"refId": "C", "expr": CLIMATE_PRESSURE_SERIES, "legendFormat": "VPD (kPa)"}, {"refId": "D", "expr": CLIMATE_DEWPOINT_SERIES, "legendFormat": "Dew Point (°C)"}, ], field_overrides=[ { "matcher": {"id": "byName", "options": "Humidity (%)"}, "properties": [ {"id": "unit", "value": "percent"}, ], }, { "matcher": {"id": "byName", "options": "VPD (kPa)"}, "properties": [ {"id": "unit", "value": "none"}, {"id": "custom.axisPlacement", "value": "right"}, {"id": "custom.axisLabel", "value": "kPa"}, {"id": "decimals", "value": 2}, ], } ], legend_display="table", legend_placement="right", description="Two-axis chart: tent temperature/humidity/dew point (left axis) and tent VPD in kPa (right axis).", ) ) panels.append( stat_panel( 5, "Fan Activity", None, {"h": 8, "w": 12, "x": 0, "y": 16}, unit="none", decimals=0, text_mode="name_and_value", targets=[ {"refId": "A", "expr": f"round({CLIMATE_FAN_OUTLET_CURRENT})", "legendFormat": "Inside Outlet", "instant": True}, {"refId": "B", "expr": f"round({CLIMATE_FAN_INSIDE_INLET_CURRENT})", "legendFormat": "Inside Inlet", 
                "instant": True},
                {"refId": "C", "expr": f"round({CLIMATE_FAN_OUTSIDE_INLET_CURRENT})", "legendFormat": "Outside Inlet",
                 "instant": True},
                {"refId": "D", "expr": f"round({CLIMATE_FAN_INTERIOR_CURRENT})", "legendFormat": "Interior Fans",
                 "instant": True},
            ],
            # Fan levels run 0-10: green below 7, yellow from 7, red from 9.
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 7},
                    {"color": "red", "value": 9},
                ],
            },
            orientation="horizontal",
            wide_layout=True,
            description="Current fan activity levels (0-10): inside outlet, inside inlet, outside inlet, and interior fans.",
        )
    )
    # Panel 6: history companion to the fan snapshot panel above, capped at 10.
    panels.append(
        timeseries_panel(
            6,
            "Fan Intensity History",
            None,
            {"h": 8, "w": 12, "x": 12, "y": 16},
            unit="none",
            max_value=10,
            targets=[
                {"refId": "A", "expr": CLIMATE_FAN_OUTLET_SERIES, "legendFormat": "Inside Outlet"},
                {"refId": "B", "expr": CLIMATE_FAN_INSIDE_INLET_SERIES, "legendFormat": "Inside Inlet"},
                {"refId": "C", "expr": CLIMATE_FAN_OUTSIDE_INLET_SERIES, "legendFormat": "Outside Inlet"},
                {"refId": "D", "expr": CLIMATE_FAN_INTERIOR_SERIES, "legendFormat": "Interior Fans"},
            ],
            legend_display="table",
            legend_placement="right",
            description="Historical fan activity for all four fan groups (0-10 scale).",
        )
    )
    # Dashboard envelope; serialized to JSON by write_json() below.
    return {
        "uid": "atlas-power",
        "title": "Atlas Power",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-24h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "power", "climate"],
    }


def build_gpu_dashboard() -> dict:
    """Build the private Atlas GPU dashboard (namespace and node utilisation).

    Returns the Grafana dashboard JSON model as a dict; placed in the
    PRIVATE_FOLDER and driven by the $namespace_scope_gpu template variable.
    """
    panels = []
    # Namespace-scoped panels interpolate this Grafana template variable.
    gpu_scope = "$namespace_scope_gpu"
    # Panel 1: pie of GPU share per namespace within the selected scope.
    panels.append(
        pie_panel(
            1,
            "Namespace GPU Share",
            namespace_gpu_share_expr(gpu_scope),
            {"h": 8, "w": 12, "x": 0, "y": 0},
            links=namespace_scope_links("namespace_scope_gpu"),
            description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
        )
    )
    # Panel 2: utilisation over time, one series per namespace.
    panels.append(
        timeseries_panel(
            2,
            "GPU Util by Namespace",
            namespace_gpu_usage_instant(gpu_scope),
            {"h": 8, "w": 12, "x": 12, "y": 0},
            unit="percent",
            legend="{{namespace}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    # Panel 3: utilisation over time, one series per node (DCGM "Hostname" label).
    panels.append(
        timeseries_panel(
            3,
            "GPU Util by Node",
            gpu_util_by_hostname(),
            {"h": 8, "w": 12, "x": 0, "y": 8},
            unit="percent",
            legend="{{Hostname}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    # Panel 4: top-10 pods by summed DCGM GPU utilisation.
    panels.append(
        table_panel(
            4,
            "Top Pods by GPU Util",
            'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
            {"h": 8, "w": 12, "x": 12, "y": 8},
            unit="percent",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    return {
        "uid": "atlas-gpu",
        "title": "Atlas GPU",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "gpu"],
        "templating": {
            "list": [
                # NOTE(review): only namespace_scope_gpu is referenced by the panels
                # above; the cpu/ram variables look copied from a sibling dashboard
                # -- confirm whether they are needed here.
                namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"),
                namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"),
                namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"),
            ]
        },
    }


# Registry: dashboard UID -> its builder function and target ConfigMap path.
# main() iterates this to (re)build JSON and render ConfigMaps.
DASHBOARDS = {
    "atlas-overview": {
        "builder": build_overview,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-overview.yaml",
    },
    "atlas-pods": {
        "builder": build_pods_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-pods.yaml",
    },
    "atlas-nodes": {
        "builder": build_nodes_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-nodes.yaml",
    },
    "atlas-storage": {
        "builder": build_storage_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-storage.yaml",
    },
    "atlas-network": {
        "builder": build_network_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml",
    },
    "atlas-mail": {
        "builder": build_mail_dashboard,
        "configmap": ROOT /
        "services" / "monitoring" / "grafana-dashboard-mail.yaml",
    },
    "atlas-testing": {
        "builder": build_testing_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-testing.yaml",
    },
    "atlas-gitops": {
        "builder": build_gitops_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gitops.yaml",
    },
    "atlas-power": {
        "builder": build_power_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-power.yaml",
    },
    "atlas-gpu": {
        "builder": build_gpu_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",
    },
}


def write_json(uid: str, data: dict) -> None:
    """Serialize a freshly built dashboard dict to DASHBOARD_DIR/<uid>.json.

    The shared status palette is applied before writing so the on-disk JSON
    is already normalized to the Atlas color tones.
    """
    DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
    path = DASHBOARD_DIR / f"{uid}.json"
    data = apply_global_status_palette(data)
    path.write_text(json.dumps(data, indent=2) + "\n")


def render_configmap(uid: str, info: dict) -> None:
    """Render DASHBOARD_DIR/<uid>.json into the ConfigMap at info["configmap"].

    The palette pass is applied again here (it is idempotent) so that JSON
    edited by hand -- i.e. a run without --build -- is still normalized
    before being embedded in the ConfigMap.
    """
    json_path = DASHBOARD_DIR / f"{uid}.json"
    payload = json.dumps(apply_global_status_palette(json.loads(json_path.read_text())), indent=2)
    # Indent every payload line so it nests under the YAML block scalar
    # ("<key>: |") in CONFIG_TEMPLATE.
    indented = "\n".join("    " + line for line in payload.splitlines())
    output_path = info["configmap"]
    content = CONFIG_TEMPLATE.format(
        relative_path=output_path.relative_to(ROOT),
        name=output_path.stem,
        key=json_path.name,
        payload=indented,
    )
    output_path.write_text(content)
    print(f"Rendered {json_path.name} -> {output_path.relative_to(ROOT)}")


def main() -> None:
    """CLI entry point: with --build, regenerate JSON first; always re-render ConfigMaps."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--build", action="store_true", help="Regenerate dashboard JSON files from builders")
    args = parser.parse_args()
    if args.build:
        for uid, info in DASHBOARDS.items():
            write_json(uid, info["builder"]())
    for uid, info in DASHBOARDS.items():
        render_configmap(uid, info)


if __name__ == "__main__":
    main()