5522 lines
206 KiB
Python
5522 lines
206 KiB
Python
#!/usr/bin/env python3
|
|
"""Generate Atlas Grafana dashboards and render them into ConfigMaps.
|
|
|
|
Usage:
|
|
scripts/dashboards_render_atlas.py --build # rebuild JSON + ConfigMaps
|
|
scripts/dashboards_render_atlas.py # re-render ConfigMaps from JSON
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import textwrap
|
|
import urllib.parse
|
|
from pathlib import Path
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Paths, folders, and shared metadata
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Parent of the directory that contains this script (two levels above the file).
ROOT = Path(__file__).resolve().parents[1]
# Dashboard directory under ROOT.
DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards"
# ConfigMap manifest skeleton with {relative_path}, {name}, {key} and {payload}
# placeholders. The first line starts directly after the opening quotes, so the
# common margin is empty and textwrap.dedent returns the text unchanged.
# NOTE(review): nesting under metadata/labels/data is written with 2-space YAML
# indentation and {payload} sits at column 0 (presumably pre-indented by the
# caller) — confirm against a rendered ConfigMap.
CONFIG_TEMPLATE = textwrap.dedent(
    """# {relative_path}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {name}
  labels:
    grafana_dashboard: "1"
data:
  {key}: |
{payload}
"""
)
|
|
|
|
# Datasource reference (type + uid) shared by generated panels.
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
# Folder identifiers; how each one is consumed is defined outside this section.
PUBLIC_FOLDER = "overview"
PUBLIC_DASHBOARD_FOLDER = "atlas-public"
PRIVATE_FOLDER = "atlas-internal"
# Mountpoint that astraios_usage_expr() passes to filesystem_usage_expr().
ASTRAIOS_MOUNTPOINT = "/mnt/astraios"
|
|
# Every supported status color maps to its "dark-" tone.
GLOBAL_STATUS_COLOR_TONES = {
    tone: f"dark-{tone}" for tone in ("blue", "green", "yellow", "orange", "red")
}
# Dictionary keys whose string values are treated as color names.
COLOR_VALUE_KEYS = {"color", "fixedColor"}


def apply_global_status_palette(value, parent_key=None):
    """Return a copy of ``value`` with status colors mapped to the Atlas tones.

    Dicts and lists are walked recursively. A string is remapped only when the
    dictionary key it was found under (``parent_key``) is in COLOR_VALUE_KEYS;
    list items inherit the key of the list itself. Unknown color names and all
    other values are returned unchanged.
    """
    if isinstance(value, dict):
        remapped = {}
        for field, child in value.items():
            remapped[field] = apply_global_status_palette(child, field)
        return remapped
    if isinstance(value, list):
        return [apply_global_status_palette(child, parent_key) for child in value]
    is_color_field = isinstance(value, str) and parent_key in COLOR_VALUE_KEYS
    return GLOBAL_STATUS_COLOR_TONES.get(value, value) if is_color_field else value
|
|
|
|
|
|
def latest_suite_value(
|
|
selector: str, window: str, success_selector: str | None = None
|
|
) -> str:
|
|
"""Return the newest sample per suite, ignoring stale Pushgateway scrape instances."""
|
|
sample = f"last_over_time({selector}[{window}])"
|
|
sample_time = f"tlast_over_time({selector}[{window}])"
|
|
if success_selector:
|
|
successful_run = f"(last_over_time({success_selector}[{window}]) > 0)"
|
|
sample = f"(({sample}) and on(suite) ({successful_run}))"
|
|
sample_time = f"(({sample_time}) and on(suite) ({successful_run}))"
|
|
latest_series = f"topk by (suite) (1, {sample_time})"
|
|
return f"max by (suite) (({sample}) and ({latest_series}))"
|
|
|
|
|
|
def deduped_counter_increase(selector: str, window: str, step: str = "1m") -> str:
    """Build ``increase()`` over a subquery that first drops ``instance``/``job``.

    ``max without(instance, job)`` merges series that differ only in those two
    labels; the result is evaluated as a ``[window:step]`` subquery.
    """
    collapsed = f"(max without(instance, job) ({selector}))"
    return f"increase({collapsed}[{window}:{step}])"
|
|
|
|
|
|
def platform_runs_increase(label_selector: str, window: str, step: str = "1m") -> str:
    """Build a scrape-deduped increase of ``platform_quality_gate_runs_total``.

    ``label_selector`` is the comma-separated matcher list placed inside the
    braces; ``window`` and ``step`` are forwarded to deduped_counter_increase().
    """
    runs_selector = "platform_quality_gate_runs_total{" + f"{label_selector}" + "}"
    return deduped_counter_increase(runs_selector, window, step)
|
|
|
|
# Absolute threshold steps for percentage panels: green from the base (None),
# yellow at 50, orange at 75, red at 91.5.
PERCENT_THRESHOLDS = {
    "mode": "absolute",
    "steps": [
        {"color": "green", "value": None},
        {"color": "yellow", "value": 50},
        {"color": "orange", "value": 75},
        {"color": "red", "value": 91.5},
    ],
}

# Range window used by namespace_cpu_raw() for its rate() call.
NAMESPACE_CPU_WINDOW = "1m"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Cluster metadata
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Control-plane node names.
CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
# Additional hosts grouped with the control plane in CONTROL_ALL.
CONTROL_DEPENDENCIES = ["titan-db", "titan-jh"]
CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES
# Worker node names. The list order matters: NODE_TIEBREAKER enumerates
# CONTROL_ALL + WORKER_NODES and weights each node by its position.
WORKER_NODES = [
    "titan-04",
    "titan-05",
    "titan-06",
    "titan-07",
    "titan-08",
    "titan-09",
    "titan-10",
    "titan-11",
    "titan-20",
    "titan-21",
    "titan-12",
    "titan-13",
    "titan-14",
    "titan-15",
    "titan-16",
    "titan-17",
    "titan-18",
    "titan-19",
    "titan-22",
    "titan-24",
]

# "|"-joined alternations of the node names above, for regex label matchers.
CONTROL_REGEX = "|".join(CONTROL_PLANE_NODES)
CONTROL_ALL_REGEX = "|".join(CONTROL_ALL)
WORKER_REGEX = "|".join(WORKER_NODES)
# Node counts and the matching "/<count>" suffix strings.
CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
WORKER_TOTAL = len(WORKER_NODES)
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
|
|
# Namespaces considered infrastructure (excluded from workload counts)
INFRA_PATTERNS = [
    "kube-.*",
    ".*-system",
    "traefik",
    "monitoring",
    "logging",
    "cert-manager",
    "maintenance",
    "postgres",
]
# Single anchored alternation, "^(p1|p2|...)$", built from INFRA_PATTERNS.
INFRA_REGEX = f"^({'|'.join(INFRA_PATTERNS)})$"
# Namespaces allowed on control plane without counting as workloads
CP_ALLOWED_NS = INFRA_REGEX
# Node-name regex; the matching names are titan-12..19, titan-22 and titan-24.
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
# Seven panel widths; they sum to 24.
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
# Sum of kube_pod_info series on control-plane nodes whose namespace does not
# match CP_ALLOWED_NS; "or on() vector(0)" supplies 0 when nothing matches.
CONTROL_WORKLOADS_EXPR = (
    f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# PromQL helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# node_uname_info with its "nodename" label copied into a "node" label.
NODE_INFO = 'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")'


def node_filter(regex):
    """Return node_uname_info limited to ``nodename=~regex``, relabelled to ``node``."""
    matched = f'node_uname_info{{nodename=~"{regex}"}}'
    return f'label_replace({matched}, "node", "$1", "nodename", "(.*)")'


def scoped_node_expr(base, scope=""):
    """Average a per-instance expression by ``node`` and optionally scope it.

    ``base`` is joined to NODE_INFO on ``instance`` to pick up the ``node``
    label. With a non-empty ``scope`` regex, the result is additionally
    multiplied by node_filter(scope), which keeps only the matching nodes.
    """
    per_node = f"avg by (node) (({base}) * on(instance) group_left(node) {NODE_INFO})"
    if not scope:
        return per_node
    return f"({per_node}) * on(node) group_left() {node_filter(scope)}"
|
|
|
|
|
|
def node_cpu_expr(scope=""):
    """Per-node CPU busy percentage: 100 * (1 - average idle rate over 5m)."""
    busy_percent = (
        '(1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100'
    )
    return scoped_node_expr(busy_percent, scope)
|
|
|
|
|
|
def node_mem_expr(scope=""):
    """Per-node memory usage percentage: (MemTotal - MemAvailable) / MemTotal * 100."""
    total = "node_memory_MemTotal_bytes"
    available = "node_memory_MemAvailable_bytes"
    used_percent = f"avg by (instance) (({total} - {available}) / {total} * 100)"
    return scoped_node_expr(used_percent, scope)
|
|
|
|
|
|
def filesystem_usage_expr(mount, scope=""):
    """Per-node used percentage of the filesystem mounted at ``mount``.

    tmpfs and overlay filesystems are excluded; the value is
    ``(1 - avail / size) * 100`` averaged per instance.
    """
    labels = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
    used_percent = (
        "avg by (instance) ("
        f"(1 - (node_filesystem_avail_bytes{labels} "
        f"/ node_filesystem_size_bytes{labels})) * 100)"
    )
    return scoped_node_expr(used_percent, scope)
|
|
|
|
|
|
def root_usage_expr(scope=""):
    """Per-node usage percentage of the "/" filesystem."""
    return filesystem_usage_expr(mount="/", scope=scope)
|
|
|
|
|
|
def astraios_usage_expr(scope=""):
    """Per-node usage percentage of the filesystem at ASTRAIOS_MOUNTPOINT."""
    return filesystem_usage_expr(mount=ASTRAIOS_MOUNTPOINT, scope=scope)
|
|
|
|
|
|
def astreae_usage_expr(mount):
    """Used percentage of ``mount`` summed over all matching series.

    Computed as ``100 - (sum(avail) / sum(size) * 100)``, excluding tmpfs and
    overlay filesystems.
    """
    labels = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
    available = f"sum(node_filesystem_avail_bytes{labels})"
    capacity = f"sum(node_filesystem_size_bytes{labels})"
    return f"100 - ({available} / {capacity} * 100)"
|
|
|
|
|
|
def astreae_free_expr(mount):
    """Total available bytes on ``mount``, excluding tmpfs and overlay filesystems."""
    labels = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
    return f"sum(node_filesystem_avail_bytes{labels})"
|
|
|
|
|
|
def topk_with_node(expr):
    """Keep the single highest series of ``expr`` and copy ``node`` into ``__name__``."""
    highest = f"topk(1, {expr})"
    return f'label_replace({highest}, "__name__", "$1", "node", "(.*)")'
|
|
|
|
|
|
def node_net_expr(scope=""):
    """Per-node network throughput: 5m receive + transmit byte rates, excluding ``lo``."""
    received = 'rate(node_network_receive_bytes_total{device!~"lo"}[5m])'
    sent = 'rate(node_network_transmit_bytes_total{device!~"lo"}[5m])'
    throughput = f"sum by (instance) ({received} + {sent})"
    return scoped_node_expr(throughput, scope)
|
|
|
|
|
|
def node_io_expr(scope=""):
    """Per-node disk throughput: 5m read + written byte rates."""
    reads = "rate(node_disk_read_bytes_total[5m])"
    writes = "rate(node_disk_written_bytes_total[5m])"
    throughput = f"sum by (instance) ({reads} + {writes})"
    return scoped_node_expr(throughput, scope)
|
|
|
|
|
|
def namespace_selector(scope_var):
    """Return container-level matchers followed by ``scope_var``.

    Requires non-empty ``namespace``, ``pod`` and ``container`` labels and
    excludes the ``POD`` container.
    """
    return 'namespace!="",pod!="",container!="",container!="POD",{}'.format(scope_var)
|
|
|
|
|
|
def namespace_gpu_selector(scope_var):
    """Return pod-level matchers (non-empty namespace and pod) followed by ``scope_var``."""
    return 'namespace!="",pod!="",{}'.format(scope_var)
|
|
|
|
|
|
def namespace_cpu_raw(scope_var):
    """Per-namespace CPU usage rate over NAMESPACE_CPU_WINDOW."""
    matchers = namespace_selector(scope_var)
    usage_rate = (
        f"rate(container_cpu_usage_seconds_total{{{matchers}}}[{NAMESPACE_CPU_WINDOW}])"
    )
    return f"sum({usage_rate}) by (namespace)"
|
|
|
|
|
|
def namespace_ram_raw(scope_var):
    """Per-namespace sum of container working-set bytes."""
    matchers = namespace_selector(scope_var)
    return "sum(container_memory_working_set_bytes{" + matchers + "}) by (namespace)"
|
|
|
|
|
|
def namespace_gpu_usage_instant(scope_var):
    """Alias for gpu_usage_by_namespace(); returns its expression unchanged."""
    expr = gpu_usage_by_namespace(scope_var)
    return expr
|
|
|
|
|
|
def jetson_gpu_util_by_node():
    """Highest ``jetson_gr3d_freq_percent`` per node (series with a node label)."""
    return "max by (node) ({})".format('jetson_gr3d_freq_percent{node!=""}')


def dcgm_gpu_util_by_node():
    """Average ``DCGM_FI_DEV_GPU_UTIL`` per node.

    The ``Hostname`` label is copied into ``pod`` and ``namespace`` is forced to
    "monitoring" so the series can be joined with kube_pod_info to obtain
    ``node``. (Presumably Hostname carries the exporter pod name — confirm.)
    """
    hostname_as_pod = 'label_replace(DCGM_FI_DEV_GPU_UTIL, "pod", "$1", "Hostname", "(.*)")'
    in_monitoring = f'label_replace({hostname_as_pod}, "namespace", "monitoring", "", "")'
    return (
        f"avg by (node) ({in_monitoring} * on(namespace,pod) group_left(node) "
        'kube_pod_info{namespace="monitoring"})'
    )


def gpu_util_by_node():
    """DCGM utilization per node, falling back to the Jetson metric via ``or``."""
    return " or ".join([dcgm_gpu_util_by_node(), jetson_gpu_util_by_node()])


def gpu_util_by_hostname():
    """gpu_util_by_node() with the ``node`` label copied into ``Hostname``."""
    by_node = gpu_util_by_node()
    return f'label_replace({by_node}, "Hostname", "$1", "node", "(.*)")'
|
|
|
|
|
|
# Matches resource names such as "nvidia_com_gpu..." and "nvidia.com/gpu...".
GPU_RESOURCE_REGEX = "nvidia(_com_|[.]com/)gpu.*"


def gpu_node_labels():
    """Per-node 0/1 flag: 1 when any allocatable GPU resource is greater than 0."""
    allocatable = f'kube_node_status_allocatable{{resource=~"{GPU_RESOURCE_REGEX}"}}'
    return f"max by (node) ({allocatable} > bool 0)"


def gpu_requests_by_namespace_node(scope_var):
    """GPU resource requests summed per (namespace, node).

    Container requests are joined with kube_pod_info to obtain ``node`` and then
    multiplied by the gpu_node_labels() flag.
    """
    requests = (
        f'kube_pod_container_resource_requests{{resource=~"{GPU_RESOURCE_REGEX}",{scope_var}}}'
    )
    on_gpu_nodes = f"* on(node) group_left() ({gpu_node_labels()})"
    return (
        f"sum by (namespace,node) ({requests} "
        "* on(namespace,pod) group_left(node) kube_pod_info "
        f"{on_gpu_nodes})"
    )


def gpu_requests_by_namespace(scope_var):
    """GPU resource requests summed per namespace."""
    per_node = gpu_requests_by_namespace_node(scope_var)
    return f"sum by (namespace) ({per_node})"
|
|
|
|
|
|
def dcgm_gpu_util_metric(scope_var):
    """DCGM utilization selector limited to series with pod and namespace labels."""
    return "DCGM_FI_DEV_GPU_UTIL{" + 'pod!="",namespace!="",{}'.format(scope_var) + "}"


def dcgm_gpu_namespace_count_by_uuid(scope_var):
    """Number of distinct namespaces reporting each GPU ``UUID``."""
    per_uuid_namespace = f"count by (UUID,namespace) ({dcgm_gpu_util_metric(scope_var)})"
    return f"count by (UUID) ({per_uuid_namespace})"


def dcgm_gpu_utilization_by_namespace(scope_var):
    """Average GPU utilization over the dashboard range, attributed per namespace.

    GPUs reported by exactly one namespace are summed under that namespace.
    GPUs reported by more than one namespace are summed into a single series
    labelled ``namespace="shared"``.
    """
    dcgm = dcgm_gpu_util_metric(scope_var)
    namespace_count = dcgm_gpu_namespace_count_by_uuid(scope_var)

    def averaged(grouping, comparison):
        # Utilization per `grouping`, kept only for UUIDs whose namespace count
        # satisfies `comparison`, averaged across $__range.
        return (
            f"avg_over_time(((max by ({grouping}) ({dcgm})) "
            f"and on(UUID) ({namespace_count} {comparison}))[$__range:$__interval])"
        )

    single_owner = averaged("UUID,namespace", "== 1")
    multi_owner = averaged("UUID", "> 1")
    unambiguous = f"sum by (namespace) ({single_owner})"
    shared = f'label_replace(sum({multi_owner}), "namespace", "shared", "", "")'
    return f"({unambiguous}) or ({shared})"


def dcgm_gpu_utilization_present(scope_var):
    """Sum of per-UUID peak utilization over the range, or 0 when there is no data."""
    per_gpu = f"(max by (UUID) ({dcgm_gpu_util_metric(scope_var)}))"
    return f"(sum(max_over_time({per_gpu}[$__range:$__interval])) or on() vector(0))"
|
|
|
|
|
|
def gpu_usage_by_namespace(scope_var):
    """Per-namespace GPU usage: request share on each node times node utilization.

    A namespace's share on a node is its GPU requests divided by all GPU
    requests on that node (denominator clamped to at least 1); the share is
    multiplied by gpu_util_by_node() and summed per namespace.
    """
    requested = gpu_requests_by_namespace_node(scope_var)
    node_total = f"sum by (node) ({requested})"
    share = f"({requested}) / on(node) group_left() clamp_min({node_total}, 1)"
    weighted = f"{share} * on(node) group_left() ({gpu_util_by_node()})"
    return f"sum by (namespace) ({weighted})"
|
|
|
|
|
|
def jetson_gpu_usage_by_namespace(scope_var):
    """Per-namespace GPU usage weighted by the Jetson utilization metric only.

    Same request-share calculation as gpu_usage_by_namespace(), but multiplied
    by jetson_gpu_util_by_node() (inserted without surrounding parentheses).
    """
    requested = gpu_requests_by_namespace_node(scope_var)
    node_total = f"sum by (node) ({requested})"
    share = f"({requested}) / on(node) group_left() clamp_min({node_total}, 1)"
    weighted = f"{share} * on(node) group_left() {jetson_gpu_util_by_node()}"
    return f"sum by (namespace) ({weighted})"
|
|
|
|
|
|
def namespace_share_expr(resource_expr):
    """Express ``resource_expr`` as a percentage of its own total.

    The denominator is the sum of the expression, clamped to at least 1.
    """
    denominator = "clamp_min(sum( {} ), 1)".format(resource_expr)
    numerator = "100 * ( {} )".format(resource_expr)
    return f"{numerator} / {denominator}"
|
|
|
|
|
|
def namespace_cpu_share_expr(scope_var):
    """Per-namespace CPU usage as a percentage of the scoped total."""
    cpu_by_namespace = namespace_cpu_raw(scope_var)
    return namespace_share_expr(cpu_by_namespace)
|
|
|
|
|
|
def namespace_ram_share_expr(scope_var):
    """Per-namespace working-set memory as a percentage of the scoped total."""
    ram_by_namespace = namespace_ram_raw(scope_var)
    return namespace_share_expr(ram_by_namespace)
|
|
|
|
|
|
def namespace_gpu_share_expr(scope_var):
    """Per-namespace share of GPU utilization, with an "idle" fallback slice.

    Each namespace's utilization is divided by the total (clamped to at least
    1). When dcgm_gpu_utilization_present() equals 0, a single
    ``namespace="idle"`` series with value 100 is produced instead.
    """
    by_namespace = dcgm_gpu_utilization_by_namespace(scope_var)
    overall = f"(sum({by_namespace}) or on() vector(0))"
    share = f"100 * ({by_namespace}) / clamp_min({overall}, 1)"
    presence = dcgm_gpu_utilization_present(scope_var)
    idle = (
        'label_replace(vector(100), "namespace", "idle", "", "") '
        f"and on() ({presence} == 0)"
    )
    return f"({share}) or ({idle})"
|
|
|
|
|
|
# Count over (namespace, pod) of phase series other than Running/Succeeded;
# "or on() vector(0)" supplies 0 when nothing matches.
PROBLEM_PODS_EXPR = (
    'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"})) '
    "or on() vector(0)"
)
# Count over (namespace, pod) of containers waiting with CrashLoopBackOff or
# ImagePullBackOff.
CRASHLOOP_EXPR = (
    'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
    '{reason=~"CrashLoopBackOff|ImagePullBackOff"})) '
    "or on() vector(0)"
)
# Count of pods whose deletion timestamp is more than 600 seconds in the past.
# "> bool" yields 0/1 per pod, so the outer sum counts the 1s.
STUCK_TERMINATING_EXPR = (
    'sum(max by (namespace,pod) ('
    '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)'
    ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
    ')) '
    "or on() vector(0)"
)
|
|
# Window embedded in the name of the availability recording metric.
UPTIME_WINDOW = "365d"
# vmalert precomputes the expensive long-window rollup so Grafana only reads one compact series.
UPTIME_RECORDING_METRIC = f'atlas:availability:ratio_{UPTIME_WINDOW}{{scope="atlas"}}'
# Most recent recorded value within the last 24h.
UPTIME_RECORDING_EXPR = f"last_over_time({UPTIME_RECORDING_METRIC}[24h])"
# Available / desired replicas of the "traefik" deployment in the traefik or
# kube-system namespace; the denominator is clamped to at least 1.
TRAEFIK_READY_EXPR = (
    "("
    'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
    " / clamp_min("
    'sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)'
    ")"
)
# Ready control-plane nodes divided by CONTROL_TOTAL.
CONTROL_READY_FRACTION_EXPR = (
    f"(sum(kube_node_status_condition{{condition=\"Ready\",status=\"true\",node=~\"{CONTROL_REGEX}\"}})"
    f" / {CONTROL_TOTAL})"
)
# Lower of the two readiness fractions.
# NOTE(review): two-argument min(a, b) is not standard PromQL aggregation
# syntax; presumably this relies on the datasource's query dialect — confirm.
UPTIME_AVAIL_EXPR = (
    f"min(({CONTROL_READY_FRACTION_EXPR}), ({TRAEFIK_READY_EXPR}))"
)
|
|
|
|
# Tie-breaker to deterministically pick one node per namespace when shares tie.
# Each node contributes "(node_filter(node)) * 1e-6 * idx", where idx is the
# node's 1-based position in CONTROL_ALL + WORKER_NODES; terms are joined with " + ".
NODE_TIEBREAKER = " + ".join(
    f"({node_filter(node)}) * 1e-6 * {idx}"
    for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
)
|
|
# Both aliases resolve to the precomputed availability ratio.
UPTIME_AVG_EXPR = UPTIME_RECORDING_EXPR
UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
# "Number of nines": -log10(1 - ratio). The ratio is capped at 0.999999999 so
# the logarithm stays finite when availability is exactly 1.
UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
# Threshold steps on the nines scale (2, 3, 3.5).
UPTIME_THRESHOLDS = {
    "mode": "absolute",
    "steps": [
        {"color": "red", "value": None},
        {"color": "orange", "value": 2},
        {"color": "yellow", "value": 3},
        {"color": "green", "value": 3.5},
    ],
}
# Threshold steps on the 0..1 ratio scale (0.99 through 0.99999).
UPTIME_PERCENT_THRESHOLDS = {
    "mode": "absolute",
    "steps": [
        {"color": "red", "value": None},
        {"color": "orange", "value": 0.99},
        {"color": "yellow", "value": 0.999},
        {"color": "green", "value": 0.9999},
        {"color": "blue", "value": 0.99999},
    ],
}
|
|
# Pod age in seconds (time() - kube_pod_created), joined with kube_pod_info for
# the node label and with non-Running/Succeeded phase series for the phase label.
PROBLEM_TABLE_EXPR = (
    "(time() - kube_pod_created{pod!=\"\"}) "
    "* on(namespace,pod) group_left(node) kube_pod_info "
    "* on(namespace,pod) group_left(phase) "
    "max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})"
)
# Pod age joined with the node label and with the waiting reason
# (CrashLoopBackOff / ImagePullBackOff).
# NOTE(review): the last join matches on (namespace,pod,container); confirm the
# left-hand side actually carries a container label at that point.
CRASHLOOP_TABLE_EXPR = (
    "(time() - kube_pod_created{pod!=\"\"}) "
    "* on(namespace,pod) group_left(node) kube_pod_info "
    "* on(namespace,pod,container) group_left(reason) "
    "max by (namespace,pod,container,reason) "
    "(kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})"
)
# Seconds since the deletion timestamp for pods that have one, joined with
# kube_pod_info for the node label.
STUCK_TABLE_EXPR = (
    "("
    "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) "
    "and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) "
    "* on(namespace,pod) group_left(node) kube_pod_info"
    ")"
)
|
|
|
|
# Namespace label matchers: everything except infrastructure, everything, and
# infrastructure only (see INFRA_REGEX).
NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"'
NAMESPACE_SCOPE_ALL = 'namespace=~".*"'
NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"'
# Scope variable names, one each for CPU, GPU and RAM.
NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"]
|
|
|
|
|
|
# Regex metacharacters that can be neutralised by wrapping them in a
# single-character class, e.g. "." -> "[.]". This is the same idiom
# GPU_RESOURCE_REGEX uses, and it avoids backslashes, which would need a second
# level of escaping inside a double-quoted PromQL string.
_PROMQL_REGEX_LITERAL_CLASSES = str.maketrans(
    {char: f"[{char}]" for char in ".+*?$(){}|"}
)


def promql_task_regex(tasks):
    """Return a PromQL-safe regex alternation for the provided task names.

    Task names are treated as literals: characters such as the "." in
    "schedule.mailu_sync" are wrapped in a character class so they match only
    themselves rather than any character. Names are joined with "|"; an empty
    iterable yields an empty string.

    Not handled: "[", "]", "^" and "\\" are passed through unchanged because
    they cannot be expressed as a plain one-character class.
    """
    return "|".join(task.translate(_PROMQL_REGEX_LITERAL_CLASSES) for task in tasks)
|
|
|
|
|
|
# Every Ariadne schedule task name covered by the "ALL" filter.
ARIADNE_ALL_SCHEDULE_TASKS = [
    "schedule.mailu_sync",
    "schedule.nextcloud_sync",
    "schedule.nextcloud_cron",
    "schedule.nextcloud_maintenance",
    "schedule.vaultwarden_sync",
    "schedule.wger_user_sync",
    "schedule.wger_admin",
    "schedule.firefly_user_sync",
    "schedule.firefly_cron",
    "schedule.vault_k8s_auth",
    "schedule.vault_oidc",
    "schedule.comms_guest_name",
    "schedule.comms_pin_invite",
    "schedule.comms_reset_room",
    "schedule.comms_seed_room",
    "schedule.pod_cleaner",
    "schedule.opensearch_prune",
    "schedule.image_sweeper",
    "schedule.metis_k3s_token_sync",
    "schedule.platform_quality_suite_probe",
]
# All tasks except comms_pin_invite and comms_reset_room, in the same order.
ARIADNE_FAST_SCHEDULE_TASKS = [
    task
    for task in ARIADNE_ALL_SCHEDULE_TASKS
    if task not in {"schedule.comms_pin_invite", "schedule.comms_reset_room"}
]
# Tasks used by the stale/failed health expressions.
ARIADNE_SCHEDULE_HEALTH_TASKS = [
    "schedule.nextcloud_sync",
    "schedule.nextcloud_cron",
    "schedule.vaultwarden_sync",
    "schedule.wger_user_sync",
    "schedule.firefly_user_sync",
    "schedule.comms_guest_name",
    "schedule.comms_seed_room",
    "schedule.pod_cleaner",
    "schedule.image_sweeper",
    "schedule.metis_k3s_token_sync",
    "schedule.platform_quality_suite_probe",
]
|
|
# Anchored regex label matchers over each task list.
ARIADNE_ALL_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_ALL_SCHEDULE_TASKS)})$"'
ARIADNE_FAST_SCHEDULE_FILTER = f'task=~"^({promql_task_regex(ARIADNE_FAST_SCHEDULE_TASKS)})$"'
ARIADNE_SCHEDULE_HEALTH_FILTER = f'task=~"^({promql_task_regex(ARIADNE_SCHEDULE_HEALTH_TASKS)})$"'
# Schedule metric selectors for the ALL, FAST and HEALTH task sets.
ARIADNE_ALL_SCHEDULE_NEXT_RUN = f"ariadne_schedule_next_run_timestamp_seconds{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
ARIADNE_ALL_SCHEDULE_LAST_SUCCESS = (
    f"ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
)
ARIADNE_ALL_SCHEDULE_LAST_ERROR = f"ariadne_schedule_last_error_timestamp_seconds{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
ARIADNE_ALL_SCHEDULE_LAST_STATUS = f"ariadne_schedule_last_status{{{ARIADNE_ALL_SCHEDULE_FILTER}}}"
ARIADNE_FAST_SCHEDULE_LAST_SUCCESS = (
    f"ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_FAST_SCHEDULE_FILTER}}}"
)
ARIADNE_FAST_SCHEDULE_LAST_ERROR = f"ariadne_schedule_last_error_timestamp_seconds{{{ARIADNE_FAST_SCHEDULE_FILTER}}}"
ARIADNE_FAST_SCHEDULE_LAST_STATUS = f"ariadne_schedule_last_status{{{ARIADNE_FAST_SCHEDULE_FILTER}}}"
ARIADNE_HEALTH_SCHEDULE_LAST_SUCCESS = (
    f"ariadne_schedule_last_success_timestamp_seconds{{{ARIADNE_SCHEDULE_HEALTH_FILTER}}}"
)
ARIADNE_HEALTH_SCHEDULE_LAST_STATUS = f"ariadne_schedule_last_status{{{ARIADNE_SCHEDULE_HEALTH_FILTER}}}"
# Ages in seconds and hours. Success age uses the HEALTH set; error age uses ALL.
ARIADNE_SCHEDULE_LAST_SUCCESS_AGE = f"(time() - {ARIADNE_HEALTH_SCHEDULE_LAST_SUCCESS})"
ARIADNE_SCHEDULE_LAST_ERROR_AGE = f"(time() - {ARIADNE_ALL_SCHEDULE_LAST_ERROR})"
ARIADNE_SCHEDULE_LAST_SUCCESS_AGE_HOURS = f"({ARIADNE_SCHEDULE_LAST_SUCCESS_AGE}) / 3600"
ARIADNE_SCHEDULE_LAST_ERROR_AGE_HOURS = f"({ARIADNE_SCHEDULE_LAST_ERROR_AGE}) / 3600"
# A health task is stale when its last success is older than 36 hours.
ARIADNE_SCHEDULE_STALE_WINDOW_SEC = 36 * 3600
ARIADNE_SCHEDULE_STALE = f"(({ARIADNE_SCHEDULE_LAST_SUCCESS_AGE}) > bool {ARIADNE_SCHEDULE_STALE_WINDOW_SEC})"
# Tasks that have a next-run series but no last-success series.
# NOTE(review): the left side is filtered by the ALL task set while the right
# side is filtered by the HEALTH set, so a task that is in ALL but not in HEALTH
# can never be removed by "unless" — confirm this is intended.
ARIADNE_SCHEDULE_MISSING = (
    f"({ARIADNE_ALL_SCHEDULE_NEXT_RUN} unless on(task) {ARIADNE_HEALTH_SCHEDULE_LAST_SUCCESS})"
)
# 1 for health tasks whose last status is below 1, else 0.
ARIADNE_SCHEDULE_FAILED = f"((1 - {ARIADNE_HEALTH_SCHEDULE_LAST_STATUS}) > bool 0)"
# Scalar counts; "or on() vector(0)" supplies 0 when there is no data.
ARIADNE_SCHEDULE_STALE_COUNT = f"sum({ARIADNE_SCHEDULE_STALE}) or on() vector(0)"
ARIADNE_SCHEDULE_MISSING_COUNT = f"count({ARIADNE_SCHEDULE_MISSING}) or on() vector(0)"
ARIADNE_SCHEDULE_FAILED_COUNT = f"sum({ARIADNE_SCHEDULE_FAILED}) or on() vector(0)"
# Increases of ariadne_task_runs_total by task, by status, or in total, over
# the window named in each constant.
ARIADNE_TASK_ERRORS_RANGE = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[$__range]))'
ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_ERRORS_1H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_30D = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[30d]))'
ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))'
ARIADNE_TASK_RUNS_BY_STATUS_1H = 'sum by (status) (increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ERRORS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[1h]))'
ARIADNE_TASK_ERRORS_24H_TOTAL = 'sum(increase(ariadne_task_runs_total{status="error"}[24h]))'
ARIADNE_TASK_RUNS_1H_TOTAL = 'sum(increase(ariadne_task_runs_total[1h]))'
ARIADNE_TASK_ATTEMPTS_SERIES = 'sum(increase(ariadne_task_runs_total[5m]))'
ARIADNE_TASK_FAILURES_SERIES = 'sum(increase(ariadne_task_runs_total{status="error"}[5m]))'
# Runs whose status is neither "ok" nor "error".
ARIADNE_TASK_WARNINGS_SERIES = (
    'sum(increase(ariadne_task_runs_total{status!~"ok|error"}[$__interval])) or on() vector(0)'
)
# Hours since the last success/error, instantaneous and over the dashboard range.
ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = f"(time() - {ARIADNE_ALL_SCHEDULE_LAST_SUCCESS}) / 3600"
ARIADNE_SCHEDULE_LAST_ERROR_HOURS = f"(time() - {ARIADNE_ALL_SCHEDULE_LAST_ERROR}) / 3600"
ARIADNE_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
    f"(time() - max_over_time({ARIADNE_ALL_SCHEDULE_LAST_SUCCESS}[$__range])) / 3600"
)
ARIADNE_SCHEDULE_LAST_ERROR_RANGE_HOURS = (
    f"(time() - max_over_time({ARIADNE_ALL_SCHEDULE_LAST_ERROR}[$__range])) / 3600"
)
ARIADNE_FAST_SCHEDULE_LAST_SUCCESS_RANGE_HOURS = (
    f"(time() - max_over_time({ARIADNE_FAST_SCHEDULE_LAST_SUCCESS}[$__range])) / 3600"
)
# Hours until the next scheduled run.
# NOTE(review): despite the FAST name this uses the ALL next-run selector —
# confirm whether a FAST-filtered selector was intended.
ARIADNE_FAST_SCHEDULE_NEXT_RUN_HOURS = f"(({ARIADNE_ALL_SCHEDULE_NEXT_RUN} - time()) / 3600)"
# Bare metric name for access requests.
ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total"
|
|
# Canonical suite names; the order determines refIds in
# PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS.
PLATFORM_TEST_SUITE_NAMES = [
    "ariadne",
    "metis",
    "ananke",
    "atlasbot",
    "lesavka",
    "pegasus",
    "soteria",
    "titan_iac",
    "typhon",
    "bstein_home",
    "data_prepper",
]
# Regex alternations over the "status" label values.
PLATFORM_TEST_SUCCESS_STATUS = "ok|passed|success"
PLATFORM_TEST_NON_FAILURE_STATUS = f"{PLATFORM_TEST_SUCCESS_STATUS}|not_applicable|skipped|na|n/a"
# Regex alternation of the standard check names.
PLATFORM_TEST_STANDARD_CHECK_REGEX = (
    "tests|coverage|loc|style|docs_naming|gate_glue|sonarqube|supply_chain"
)
# Value of the exported_job label used to filter platform quality series.
PLATFORM_TEST_CI_JOB = "platform-quality-ci"
PLATFORM_TEST_EXPORT_FILTER = f'exported_job="{PLATFORM_TEST_CI_JOB}"'
# Canonical suite name -> regex alternation of the accepted "suite" label
# values (some suites also accept hyphenated or alternative spellings).
PLATFORM_TEST_SUITE_VALUE_BY_NAME = {
    "ariadne": "ariadne",
    "metis": "metis",
    "ananke": "ananke",
    "atlasbot": "atlasbot",
    "lesavka": "lesavka",
    "pegasus": "pegasus|pegasus-health|pegasus_health",
    "soteria": "soteria",
    "titan_iac": "titan_iac|titan-iac",
    "typhon": "typhon",
    "bstein_home": "bstein_home|bstein-home",
    "data_prepper": "data_prepper|data-prepper",
}
# Canonical suite name -> Jenkins job name.
PLATFORM_TEST_JENKINS_JOB_BY_SUITE = {
    "ariadne": "ariadne",
    "metis": "metis",
    "ananke": "ananke",
    "atlasbot": "atlasbot",
    "lesavka": "lesavka",
    "pegasus": "pegasus",
    "soteria": "Soteria",
    "titan_iac": "titan-iac",
    "typhon": "typhon",
    "bstein_home": "bstein-dev-home",
    "data_prepper": "data-prepper",
}
# Default Jenkins UI base URL.
JENKINS_UI_BASE_DEFAULT = "https://ci.bstein.dev"
# Alternation of every accepted suite label value (aliases included).
PLATFORM_TEST_SUITE_MATCHER = "|".join(
    PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite) for suite in PLATFORM_TEST_SUITE_NAMES
)
# Alternation of the canonical suite names only.
PLATFORM_TEST_SUITE_CANONICAL_MATCHER = "|".join(PLATFORM_TEST_SUITE_NAMES)
PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER = PLATFORM_TEST_SUITE_CANONICAL_MATCHER
# Regex alternations of test category names. The overview variant omits
# "manual" and "unit"; the support variant lists non-test categories.
PLATFORM_TEST_CATEGORY_REGEX = (
    "api|chaos|compatibility|component|contract|e2e|integration|manual|"
    "performance|regression|reliability|security|smoke|system|ui|unit"
)
PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX = (
    "api|chaos|compatibility|component|contract|e2e|integration|"
    "performance|regression|reliability|security|smoke|system|ui"
)
PLATFORM_TEST_SUPPORT_CATEGORY_REGEX = "fixtures|golden|helpers"
# Branch values offered as options.
PLATFORM_TEST_BRANCH_OPTIONS = ("main", "master", "origin/main", "origin/master", "unknown")
# Names of recording-rule series (colon-separated names) read by the
# expressions below.
PLATFORM_TEST_RUNS_24H_ROLLUP = "platform_quality:suite_runs:increase_24h"
PLATFORM_TEST_COVERAGE_ROLLUP = "platform_quality:suite_coverage_percent:latest_1h"
PLATFORM_TEST_SOURCE_FILES_ROLLUP = "platform_quality:suite_source_files_total:latest_1h"
PLATFORM_TEST_SOURCE_LINES_OVER_500_ROLLUP = (
    "platform_quality:suite_source_lines_over_500_total:latest_1h"
)
PLATFORM_TEST_SONAR_HEALTH_ROLLUP = "platform_quality:sonar_gate_health_percent:latest_1h"
PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP = "platform_quality:test_category_health_rate:percent_1h"
|
|
# Label selectors shared by the raw run-count expressions below.
_PLATFORM_TEST_ANY_STATUS_RUNS = (
    f'suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}'
)
_PLATFORM_TEST_SUCCESSFUL_RUNS = (
    f'suite=~"{PLATFORM_TEST_SUITE_MATCHER}",'
    f'status=~"{PLATFORM_TEST_SUCCESS_STATUS}",'
    f"{PLATFORM_TEST_EXPORT_FILTER}"
)
# 24h rollup selectors: every status, successful only, and non-successful only.
_PLATFORM_TEST_ROLLUP_ALL = (
    f'{PLATFORM_TEST_RUNS_24H_ROLLUP}{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}"}}'
)
_PLATFORM_TEST_ROLLUP_OK = (
    f'{PLATFORM_TEST_RUNS_24H_ROLLUP}{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",'
    f'status=~"{PLATFORM_TEST_SUCCESS_STATUS}"}}'
)
_PLATFORM_TEST_ROLLUP_NOT_OK = (
    f'{PLATFORM_TEST_RUNS_24H_ROLLUP}{{suite=~"{PLATFORM_TEST_SUITE_MATCHER}",'
    f'status!~"{PLATFORM_TEST_SUCCESS_STATUS}"}}'
)


def _platform_test_total_or_zero(series: str) -> str:
    """Sum ``series``, substituting 0 when the query returns no data."""
    return f"(sum({series}) or on() vector(0))"


def _platform_test_percent(successes: str, total: str) -> str:
    """Percentage of ``successes`` in ``total``; the denominator is clamped to 1."""
    return f"100 * ({successes}) / clamp_min(({total}), 1)"


# Successful and total run counts over 30d (15m subquery step), 7d and 24h.
PLATFORM_TEST_SUCCESS_EVENTS_30D = _platform_test_total_or_zero(
    platform_runs_increase(_PLATFORM_TEST_SUCCESSFUL_RUNS, "30d", "15m")
)
PLATFORM_TEST_TOTAL_EVENTS_30D = _platform_test_total_or_zero(
    platform_runs_increase(_PLATFORM_TEST_ANY_STATUS_RUNS, "30d", "15m")
)
PLATFORM_TEST_SUCCESS_EVENTS_7D = _platform_test_total_or_zero(
    platform_runs_increase(_PLATFORM_TEST_SUCCESSFUL_RUNS, "7d")
)
PLATFORM_TEST_TOTAL_EVENTS_7D = _platform_test_total_or_zero(
    platform_runs_increase(_PLATFORM_TEST_ANY_STATUS_RUNS, "7d")
)
# The 24h counts read the precomputed rollup instead of the raw counter.
PLATFORM_TEST_SUCCESS_EVENTS_24H = _platform_test_total_or_zero(_PLATFORM_TEST_ROLLUP_OK)
PLATFORM_TEST_TOTAL_EVENTS_24H = _platform_test_total_or_zero(_PLATFORM_TEST_ROLLUP_ALL)
# Success percentages for each window.
TEST_SUCCESS_RATE = _platform_test_percent(
    PLATFORM_TEST_SUCCESS_EVENTS_30D, PLATFORM_TEST_TOTAL_EVENTS_30D
)
TEST_SUCCESS_RATE_7D = _platform_test_percent(
    PLATFORM_TEST_SUCCESS_EVENTS_7D, PLATFORM_TEST_TOTAL_EVENTS_7D
)
TEST_SUCCESS_RATE_24H = _platform_test_percent(
    PLATFORM_TEST_SUCCESS_EVENTS_24H, PLATFORM_TEST_TOTAL_EVENTS_24H
)
# Non-successful runs in the last 24h, in total and per suite (descending).
TEST_FAILURES_24H_TOTAL = _platform_test_total_or_zero(_PLATFORM_TEST_ROLLUP_NOT_OK)
PLATFORM_TEST_FAILURES_24H_BY_SUITE = (
    f"sort_desc(sum by (suite) ({_PLATFORM_TEST_ROLLUP_NOT_OK}))"
)
# 30d run counts broken down by suite and status.
PLATFORM_TEST_ACTIVITY_30D = (
    "sum by (suite, status) ("
    + platform_runs_increase(_PLATFORM_TEST_ANY_STATUS_RUNS, "30d", "15m")
    + ")"
)
PLATFORM_TEST_RUNS_24H_TOTAL = PLATFORM_TEST_TOTAL_EVENTS_24H
# Number of canonical suites with at least one run in the 24h rollup.
PLATFORM_TEST_ACTIVE_SUITES_24H = (
    f"sum((sum by (suite) ({PLATFORM_TEST_RUNS_24H_ROLLUP}"
    f'{{suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}"}}) > bool 0)) '
    "or on() vector(0)"
)
# Lookback windows used by the platform test expressions.
PLATFORM_TEST_POINT_WINDOW = "1h"
PLATFORM_TEST_FRESH_WINDOW = "30h"
PLATFORM_TEST_LATEST_WINDOW = "30d"
|
|
|
|
|
|
def platform_check_status_expr(
|
|
suite_matcher: str,
|
|
*,
|
|
branch_matcher: str = 'branch!=""',
|
|
check_matcher: str = 'check!=""',
|
|
status_matcher: str = 'status!=""',
|
|
window: str | None = PLATFORM_TEST_FRESH_WINDOW,
|
|
) -> str:
|
|
"""Return recent check gauges normalized to a status label."""
|
|
result_matcher = status_matcher.replace("status", "result", 1)
|
|
status_guards = [status_matcher]
|
|
result_guards = [result_matcher]
|
|
if status_matcher.startswith("status!") and status_matcher != 'status!=""':
|
|
status_guards.insert(0, 'status!=""')
|
|
if result_matcher.startswith("result!") and result_matcher != 'result!=""':
|
|
result_guards.insert(0, 'result!=""')
|
|
status_selector = ",".join(status_guards)
|
|
result_selector = ",".join(result_guards)
|
|
base = (
|
|
f'__name__=~".*_quality_gate_checks_total",{suite_matcher},'
|
|
f'{PLATFORM_TEST_EXPORT_FILTER},{check_matcher}'
|
|
)
|
|
def maybe_recent(selector: str) -> str:
|
|
metric = f"{{{selector}}}"
|
|
return f"last_over_time({metric}[{window}])" if window else metric
|
|
|
|
build_info_metric = f'platform_quality_gate_build_info{{{suite_matcher},{PLATFORM_TEST_EXPORT_FILTER},{branch_matcher}}}'
|
|
build_info_sample = (
|
|
f"last_over_time({build_info_metric}[{window}])" if window else build_info_metric
|
|
)
|
|
build_info = f"max by (suite, branch) ({build_info_sample})"
|
|
with_status_selector = f"{base},{branch_matcher},{status_selector}"
|
|
with_result_selector = f'{base},{branch_matcher},{result_selector},status=""'
|
|
inferred_status_selector = f'{base},branch="",{status_selector}'
|
|
inferred_result_selector = f'{base},branch="",{result_selector},status=""'
|
|
with_status = maybe_recent(with_status_selector)
|
|
with_result = (
|
|
f'label_replace({maybe_recent(with_result_selector)}, '
|
|
f'"status", "$1", "result", "(.*)")'
|
|
)
|
|
inferred_status = (
|
|
f'({maybe_recent(inferred_status_selector)} '
|
|
f'* on (suite) group_left(branch) ({build_info}))'
|
|
)
|
|
inferred_result = (
|
|
f'(label_replace({maybe_recent(inferred_result_selector)}, '
|
|
f'"status", "$1", "result", "(.*)") '
|
|
f'* on (suite) group_left(branch) ({build_info}))'
|
|
)
|
|
return (
|
|
f"sum by (suite, branch, check, status) ("
|
|
f"{with_status} or {with_result} or {inferred_status} or {inferred_result}"
|
|
f")"
|
|
)
|
|
|
|
|
|
def platform_check_rollup_status_expr(
    suite_matcher: str,
    *,
    branch_matcher: str = 'branch!=""',
    check_matcher: str = 'check!=""',
    status_matcher: str = 'status!=""',
) -> str:
    """Build the PromQL that sums the hourly check-status rollup series.

    The four matchers are joined, in order, into the label selector of
    ``platform_quality:check_status:present_1h`` and the result is summed by
    ``suite``, ``branch``, ``check`` and ``status``.
    """
    label_matchers = ",".join(
        (suite_matcher, branch_matcher, check_matcher, status_matcher)
    )
    rollup_series = "platform_quality:check_status:present_1h{" + label_matchers + "}"
    return f"sum by (suite, branch, check, status) ({rollup_series})"
|
|
|
|
|
|
def _suite_success_rate_expr(suite):
    """Return the success-rate percentage PromQL for one suite.

    The numerator counts runs whose status matches
    ``PLATFORM_TEST_SUCCESS_STATUS``; the denominator counts every exported run
    and is clamped to at least 1 so the division never hits zero.
    """
    suite_value = PLATFORM_TEST_SUITE_VALUE_BY_NAME.get(suite, suite)
    suite_selector = f'suite=~"{suite_value}"'
    succeeded_runs = platform_runs_increase(
        f'{suite_selector},status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}',
        PLATFORM_TEST_POINT_WINDOW,
    )
    all_runs = platform_runs_increase(
        f"{suite_selector},{PLATFORM_TEST_EXPORT_FILTER}",
        PLATFORM_TEST_POINT_WINDOW,
    )
    # The earlier literal ended in ", 1))", closing one parenthesis more than it
    # opened, which made the generated query unparseable.
    return f"(100 * (sum({succeeded_runs}))) / clamp_min((sum({all_runs})), 1)"


# One Grafana target per suite; refIds run "A", "B", ... in suite order.
PLATFORM_TEST_SUCCESS_RATE_SUITE_TARGETS = [
    {
        "refId": chr(ord("A") + index),
        "expr": _suite_success_rate_expr(suite),
        "legendFormat": suite,
    }
    for index, suite in enumerate(PLATFORM_TEST_SUITE_NAMES)
]
|
|
# Label matcher restricting a selector to the canonical suite names.
_CANONICAL_SUITE_MATCHER = f'suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}"'

# 24h success rate per suite: successful runs over all runs (denominator >= 1).
_SUCCEEDED_RUNS_24H = platform_runs_increase(
    f'suite=~"{PLATFORM_TEST_SUITE_MATCHER}",status=~"{PLATFORM_TEST_SUCCESS_STATUS}",{PLATFORM_TEST_EXPORT_FILTER}',
    "24h",
)
_ALL_RUNS_24H = platform_runs_increase(
    f'suite=~"{PLATFORM_TEST_SUITE_MATCHER}",{PLATFORM_TEST_EXPORT_FILTER}',
    "24h",
)
PLATFORM_TEST_SUCCESS_RATE_24H_BY_SUITE = (
    f"sort_desc(100 * (sum by (suite) ({_SUCCEEDED_RUNS_24H})) "
    f"/ clamp_min((sum by (suite) ({_ALL_RUNS_24H})), 1))"
)

# Per-suite run activity over 30 days; used below as the list of known suites.
_CANONICAL_RUNS_30D = platform_runs_increase(
    f"{_CANONICAL_SUITE_MATCHER},{PLATFORM_TEST_EXPORT_FILTER}", "30d", "15m"
)
QUALITY_GATE_SUITE_INDEX_30D = f"sum by (suite) ({_CANONICAL_RUNS_30D})"


def _missing_suite_fallback(expr):
    """Add a -1 series for every suite in the 30d index that *expr* lacks."""
    return f"({expr}) or on(suite) (0 * ({QUALITY_GATE_SUITE_INDEX_30D}) - 1)"


def _check_flag(selector):
    """Collapse *selector* to a 0/1 flag per (suite, check)."""
    return f"clamp_max(max by (suite, check) (({selector}) > 0), 1)"


QUALITY_GATE_COVERAGE_BY_SUITE = (
    f"max by (suite) ({PLATFORM_TEST_COVERAGE_ROLLUP}{{{_CANONICAL_SUITE_MATCHER}}})"
)
QUALITY_GATE_COVERAGE_BY_SUITE_WITH_MISSING = _missing_suite_fallback(
    QUALITY_GATE_COVERAGE_BY_SUITE
)
# Distance below the 95 mark, floored at zero.
QUALITY_GATE_COVERAGE_GAP_BY_SUITE = (
    "clamp_min(95 - (" + QUALITY_GATE_COVERAGE_BY_SUITE + "), 0)"
)
QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE = (
    f"max by (suite) ({PLATFORM_TEST_SOURCE_LINES_OVER_500_ROLLUP}{{{_CANONICAL_SUITE_MATCHER}}})"
)
QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE_WITH_MISSING = _missing_suite_fallback(
    QUALITY_GATE_SMELL_INFRACTIONS_BY_SUITE
)

PLATFORM_TEST_CHECKS_SELECTOR = ",".join(
    (
        '__name__=~".*_quality_gate_checks_total"',
        _CANONICAL_SUITE_MATCHER,
        f"{PLATFORM_TEST_EXPORT_FILTER}",
    )
)
PLATFORM_TEST_CHECK_ROLLUP_MATCHERS = _CANONICAL_SUITE_MATCHER
PLATFORM_TEST_PRIMARY_BRANCH_MATCHER = 'branch=~"main|master|origin/main|origin/master"'

# Keyword arguments shared by the three primary-branch rollup selectors.
_PRIMARY_ROLLUP_KWARGS = {
    "branch_matcher": PLATFORM_TEST_PRIMARY_BRANCH_MATCHER,
    "check_matcher": f'check=~"{PLATFORM_TEST_STANDARD_CHECK_REGEX}"',
}
PLATFORM_TEST_CHECK_ROLLUP_SELECTOR = platform_check_rollup_status_expr(
    PLATFORM_TEST_CHECK_ROLLUP_MATCHERS, **_PRIMARY_ROLLUP_KWARGS
)
PLATFORM_TEST_CHECK_ROLLUP_OK_SELECTOR = platform_check_rollup_status_expr(
    PLATFORM_TEST_CHECK_ROLLUP_MATCHERS,
    status_matcher=f'status=~"{PLATFORM_TEST_NON_FAILURE_STATUS}"',
    **_PRIMARY_ROLLUP_KWARGS,
)
PLATFORM_TEST_CHECK_ROLLUP_FAILED_SELECTOR = platform_check_rollup_status_expr(
    PLATFORM_TEST_CHECK_ROLLUP_MATCHERS,
    status_matcher=f'status!~"{PLATFORM_TEST_NON_FAILURE_STATUS}"',
    **_PRIMARY_ROLLUP_KWARGS,
)

PLATFORM_TEST_CHECK_ROLLUP_SEEN_FLAGS = _check_flag(PLATFORM_TEST_CHECK_ROLLUP_SELECTOR)
PLATFORM_TEST_CHECK_ROLLUP_FAILED_FLAGS = _check_flag(
    PLATFORM_TEST_CHECK_ROLLUP_FAILED_SELECTOR
)
# A check counts as OK only when it has a non-failure status and no failure flag.
PLATFORM_TEST_CHECK_ROLLUP_OK_FLAGS = (
    _check_flag(PLATFORM_TEST_CHECK_ROLLUP_OK_SELECTOR)
    + f" unless on(suite, check) ({PLATFORM_TEST_CHECK_ROLLUP_FAILED_FLAGS})"
)
PLATFORM_TEST_CURRENT_GATE_CHECK_HEALTH_BY_SUITE = (
    f"(100 * sum by (suite) ({PLATFORM_TEST_CHECK_ROLLUP_OK_FLAGS}) "
    f"/ clamp_min(sum by (suite) ({PLATFORM_TEST_CHECK_ROLLUP_SEEN_FLAGS}), 1))"
)

_CATEGORY_HEALTH_MATCHERS = ",".join(
    (
        PLATFORM_TEST_CHECK_ROLLUP_MATCHERS,
        'branch!=""',
        PLATFORM_TEST_PRIMARY_BRANCH_MATCHER,
        f'category=~"{PLATFORM_TEST_CATEGORY_REGEX}"',
    )
)
PLATFORM_TEST_CATEGORY_HEALTH_BY_SUITE = (
    f"min by (suite) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{{_CATEGORY_HEALTH_MATCHERS}}})"
)
PLATFORM_TEST_CURRENT_GATE_HEALTH_OBSERVED_BY_SUITE = (
    f"min by (suite) (({PLATFORM_TEST_CURRENT_GATE_CHECK_HEALTH_BY_SUITE}) "
    f"or ({PLATFORM_TEST_CATEGORY_HEALTH_BY_SUITE}))"
)
PLATFORM_TEST_CURRENT_GATE_HEALTH_BY_SUITE = _missing_suite_fallback(
    PLATFORM_TEST_CURRENT_GATE_HEALTH_OBSERVED_BY_SUITE
)
PLATFORM_TEST_CURRENT_GATE_HEALTH = (
    f"(avg(({PLATFORM_TEST_CURRENT_GATE_HEALTH_OBSERVED_BY_SUITE})) or on() vector(0))"
)
|
|
# Backup age per PVC; PVCs without an age sample fall back to
# (1 - pvc_backup_health) * 999.
PVC_BACKUP_AGE_HOURS_BY_PVC = (
    "sort_desc(max by (namespace, pvc) ("
    "pvc_backup_age_hours or on(namespace, pvc) ((1 - pvc_backup_health) * 999)"
    "))"
)

# ---- Ananke UPS selectors -------------------------------------------------
ANANKE_SELECTOR = 'job="ananke-power"'
ANANKE_UPS_DB_NAME = "Pyrphoros"
ANANKE_UPS_DB_NODE = "titan-db"
ANANKE_UPS_TETHYS_NAME = "Statera"
ANANKE_UPS_TETHYS_NODE = "titan-24"
ANANKE_UPS_DB_SELECTOR = ",".join((ANANKE_SELECTOR, f'source="{ANANKE_UPS_DB_NAME}"'))
ANANKE_UPS_TETHYS_SELECTOR = ",".join(
    (ANANKE_SELECTOR, f'source="{ANANKE_UPS_TETHYS_NAME}"')
)

# Suffix that makes an aggregate return 0 instead of an empty result.
_ZERO_WHEN_EMPTY = " or on() vector(0)"


def _ananke_series(metric, selector=ANANKE_SELECTOR):
    """Return ``metric{selector}``."""
    return f"{metric}{{{selector}}}"


def _ananke_max_or_zero(metric, selector):
    """Return ``max(metric{selector})`` with the zero fallback appended."""
    return f"max({_ananke_series(metric, selector)}){_ZERO_WHEN_EMPTY}"


def _ananke_draw_watts(selector):
    """Return load percent times nominal watts, divided by 100, for one UPS."""
    load = _ananke_series("ananke_ups_load_percent", selector)
    nominal = _ananke_series("ananke_ups_power_nominal_watts", selector)
    return f"max(({load} * {nominal}) / 100)"


def _gitops_latest(metric):
    """Return the per-(namespace, name) max of a GitOps gauge."""
    return f"max by (namespace, name) ({_ananke_series(metric, GITOPS_SELECTOR)})"


def _ready_pct(latest):
    """Return the percentage of objects in *latest* whose gauge sums as ready."""
    return f"100 * sum({latest}) / clamp_min(count({latest}), 1)"


# ---- Fleet-wide UPS aggregates --------------------------------------------
ANANKE_UPS_ON_BATTERY = (
    f"sum({_ananke_series('ananke_ups_on_battery')}){_ZERO_WHEN_EMPTY}"
)
ANANKE_UPS_LOW_BATTERY = (
    f"sum({_ananke_series('ananke_ups_low_battery')}){_ZERO_WHEN_EMPTY}"
)
ANANKE_UPS_RUNTIME_MIN = (
    f"min({_ananke_series('ananke_ups_runtime_seconds')}){_ZERO_WHEN_EMPTY}"
)
ANANKE_UPS_RUNTIME_HEADROOM_PERCENT = (
    f"100 * min({_ananke_series('ananke_ups_runtime_seconds')}) / "
    f"clamp_min(max({_ananke_series('ananke_ups_threshold_seconds')}), 1)"
)
ANANKE_UPS_TRIGGER_COUNT_1D = (
    f"increase({_ananke_series('ananke_shutdown_triggers_total')}[1d]){_ZERO_WHEN_EMPTY}"
)

# ---- GitOps (same job selector as the UPS metrics) ------------------------
GITOPS_SELECTOR = ANANKE_SELECTOR
_FLUX_SOURCE_SELECTOR = GITOPS_SELECTOR + ',namespace="flux-system",name="flux-system"'
GITOPS_SOURCE_INFO = (
    "max by (branch, revision) ("
    + _ananke_series("ananke_gitops_flux_source_info", _FLUX_SOURCE_SELECTOR)
    + ")"
)

_KUSTOMIZATION_READY = _gitops_latest("ananke_gitops_kustomization_ready")
_KUSTOMIZATION_SUSPENDED = _gitops_latest("ananke_gitops_kustomization_suspended")
GITOPS_KUSTOMIZATION_READY_PCT = _ready_pct(_KUSTOMIZATION_READY)
GITOPS_KUSTOMIZATION_READY_COUNT = f"sum({_KUSTOMIZATION_READY}){_ZERO_WHEN_EMPTY}"
GITOPS_KUSTOMIZATION_TOTAL_COUNT = f"count({_KUSTOMIZATION_READY}){_ZERO_WHEN_EMPTY}"
GITOPS_KUSTOMIZATION_SUSPENDED = f"sum({_KUSTOMIZATION_SUSPENDED}){_ZERO_WHEN_EMPTY}"
GITOPS_KUSTOMIZATION_NOT_SUSPENDED_PCT = (
    f"100 * (1 - ({GITOPS_KUSTOMIZATION_SUSPENDED}) "
    f"/ clamp_min(({GITOPS_KUSTOMIZATION_TOTAL_COUNT}), 1))"
)

_HELM_READY = _gitops_latest("ananke_gitops_helmrelease_ready")
_HELM_SUSPENDED = _gitops_latest("ananke_gitops_helmrelease_suspended")
GITOPS_HELM_READY_PCT = _ready_pct(_HELM_READY)
GITOPS_HELM_READY_COUNT = f"sum({_HELM_READY}){_ZERO_WHEN_EMPTY}"
GITOPS_HELM_TOTAL_COUNT = f"count({_HELM_READY}){_ZERO_WHEN_EMPTY}"
GITOPS_HELM_SUSPENDED = f"sum({_HELM_SUSPENDED}){_ZERO_WHEN_EMPTY}"
GITOPS_HELM_NOT_SUSPENDED_PCT = (
    f"100 * (1 - ({GITOPS_HELM_SUSPENDED}) "
    f"/ clamp_min(({GITOPS_HELM_TOTAL_COUNT}), 1))"
)

GITOPS_SCRAPE_SUCCESS = (
    f"min({_ananke_series('ananke_gitops_scrape_success', GITOPS_SELECTOR)}){_ZERO_WHEN_EMPTY}"
)
GITOPS_LAST_SCRAPE_AGE = (
    "(time() - max("
    + _ananke_series("ananke_gitops_last_scrape_timestamp_seconds", GITOPS_SELECTOR)
    + "))"
    + _ZERO_WHEN_EMPTY
)

# ---- Per-UPS readings -----------------------------------------------------
ANANKE_UPS_RUNTIME_DB = _ananke_max_or_zero(
    "ananke_ups_runtime_seconds", ANANKE_UPS_DB_SELECTOR
)
ANANKE_UPS_RUNTIME_TETHYS = _ananke_max_or_zero(
    "ananke_ups_runtime_seconds", ANANKE_UPS_TETHYS_SELECTOR
)
ANANKE_UPS_ON_BATTERY_DB = _ananke_max_or_zero(
    "ananke_ups_on_battery", ANANKE_UPS_DB_SELECTOR
)
ANANKE_UPS_ON_BATTERY_TETHYS = _ananke_max_or_zero(
    "ananke_ups_on_battery", ANANKE_UPS_TETHYS_SELECTOR
)
ANANKE_UPS_BATTERY_CHARGE_DB = _ananke_max_or_zero(
    "ananke_ups_battery_charge_percent", ANANKE_UPS_DB_SELECTOR
)
ANANKE_UPS_BATTERY_CHARGE_TETHYS = _ananke_max_or_zero(
    "ananke_ups_battery_charge_percent", ANANKE_UPS_TETHYS_SELECTOR
)
ANANKE_UPS_LOAD_DB = _ananke_max_or_zero("ananke_ups_load_percent", ANANKE_UPS_DB_SELECTOR)
ANANKE_UPS_LOAD_TETHYS = _ananke_max_or_zero(
    "ananke_ups_load_percent", ANANKE_UPS_TETHYS_SELECTOR
)

# The *_SERIES variants omit the zero fallback.
ANANKE_UPS_DRAW_WATTS_DB_SERIES = _ananke_draw_watts(ANANKE_UPS_DB_SELECTOR)
ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES = _ananke_draw_watts(ANANKE_UPS_TETHYS_SELECTOR)
ANANKE_UPS_DRAW_WATTS_DB = ANANKE_UPS_DRAW_WATTS_DB_SERIES + _ZERO_WHEN_EMPTY
ANANKE_UPS_DRAW_WATTS_TETHYS = ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES + _ZERO_WHEN_EMPTY

# ---- Raw per-source series ------------------------------------------------
ANANKE_UPS_RUNTIME_BY_SOURCE = _ananke_series("ananke_ups_runtime_seconds")
ANANKE_UPS_LOAD_BY_SOURCE = _ananke_series("ananke_ups_load_percent")
ANANKE_UPS_CHARGE_BY_SOURCE = _ananke_series("ananke_ups_battery_charge_percent")
ANANKE_UPS_TRIGGER_BY_SOURCE = _ananke_series("ananke_ups_trigger_active")
|
|
# Raw climate series names.
CLIMATE_TEMP_SERIES = "typhon_temperature_celsius"
CLIMATE_PRESSURE_SERIES = "typhon_vpd_kpa"
CLIMATE_HUMIDITY_SERIES = "typhon_relative_humidity_percent"


def _climate_or_zero(aggregate, series):
    """Return ``aggregate(series)`` that yields 0 when no samples exist."""
    return f"{aggregate}({series}) or on() vector(0)"


def _fan_series(fan_group):
    """Return the fan speed level series for one ``fan_group`` label value."""
    return f'typhon_fan_speed_level{{fan_group="{fan_group}"}}'


CLIMATE_SENSOR_COUNT = _climate_or_zero("count", CLIMATE_TEMP_SERIES)
CLIMATE_TEMP_MAX = _climate_or_zero("max", CLIMATE_TEMP_SERIES)
CLIMATE_PRESSURE_CURRENT = _climate_or_zero("max", CLIMATE_PRESSURE_SERIES)
CLIMATE_HUMIDITY_MAX = _climate_or_zero("max", CLIMATE_HUMIDITY_SERIES)

# Dew point has the form 243.12 * g / (17.62 - g), with g built from humidity and
# temperature. Humidity is clamped to >= 1 before ln().
# NOTE(review): the coefficients look like the Magnus approximation -- confirm.
_DEWPOINT_GAMMA = (
    f"ln(clamp_min({CLIMATE_HUMIDITY_SERIES}, 1) / 100) + "
    f"(17.62 * {CLIMATE_TEMP_SERIES}) / (243.12 + {CLIMATE_TEMP_SERIES})"
)
CLIMATE_DEWPOINT_SERIES = (
    f"(243.12 * ({_DEWPOINT_GAMMA})) / (17.62 - ({_DEWPOINT_GAMMA}))"
)
CLIMATE_DEWPOINT_CURRENT = _climate_or_zero("max", CLIMATE_DEWPOINT_SERIES)

# Fan speed level per fan group, raw and as a current-value aggregate.
CLIMATE_FAN_OUTLET_SERIES = _fan_series("outlet")
CLIMATE_FAN_INSIDE_INLET_SERIES = _fan_series("inside_inlet")
CLIMATE_FAN_OUTSIDE_INLET_SERIES = _fan_series("outside_inlet")
CLIMATE_FAN_INTERIOR_SERIES = _fan_series("interior")
CLIMATE_FAN_OUTLET_CURRENT = _climate_or_zero("max", CLIMATE_FAN_OUTLET_SERIES)
CLIMATE_FAN_INSIDE_INLET_CURRENT = _climate_or_zero(
    "max", CLIMATE_FAN_INSIDE_INLET_SERIES
)
CLIMATE_FAN_OUTSIDE_INLET_CURRENT = _climate_or_zero(
    "max", CLIMATE_FAN_OUTSIDE_INLET_SERIES
)
CLIMATE_FAN_INTERIOR_CURRENT = _climate_or_zero("max", CLIMATE_FAN_INTERIOR_SERIES)
|
|
def _conn_labelled(expr, conn_value):
    """Return *expr* with a ``conn`` label set to *conn_value*."""
    return f'label_replace({expr}, "conn", "{conn_value}", "__name__", ".*")'


def _summed_rate_or_zero(metric, selector):
    """Return the summed 5m rate of ``metric{selector}``, or 0 when empty."""
    return f"sum(rate({metric}{{{selector}}}[5m])) or on() vector(0)"


def _quantile_latency_ms(quantile, bucket_metric):
    """Return a 5m histogram quantile of *bucket_metric* multiplied by 1000."""
    return (
        f"histogram_quantile({quantile}, sum by (le) (rate({bucket_metric}[5m]))) * 1000"
    )


# ---- Postgres -------------------------------------------------------------
POSTGRES_CONN_USED = (
    _conn_labelled("sum(pg_stat_activity_count)", "used")
    + " or "
    + _conn_labelled("max(pg_settings_max_connections)", "max")
)
POSTGRES_CONN_HOTTEST = "topk(1, sum by (datname) (pg_stat_activity_count))"

# ---- One-off jobs: Job-owned pods minus those matched by the owner series --
ONEOFF_JOB_OWNER = (
    'label_replace(kube_job_owner{owner_kind="CronJob"}, '
    '"owner_name", "$1", "job_name", "(.*)")'
)
ONEOFF_JOB_PODS = (
    '(kube_pod_owner{owner_kind="Job"} unless on(namespace, owner_name) '
    + ONEOFF_JOB_OWNER
    + ")"
)
ONEOFF_JOB_POD_AGE_HOURS = " ".join(
    (
        '((time() - kube_pod_start_time{pod!=""}) / 3600)',
        f"* on(namespace,pod) group_left(owner_name) {ONEOFF_JOB_PODS}",
        "* on(namespace,pod) group_left(phase)",
        'max by (namespace,pod,phase) (kube_pod_status_phase{phase=~"Running|Succeeded"})',
    )
)

# ---- GPU nodes ------------------------------------------------------------
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES)

# ---- Traefik and network throughput ---------------------------------------
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
_TRAEFIK_POD_SELECTOR = 'namespace="traefik",pod=~"traefik-.*"'
TRAEFIK_NET_INGRESS = _summed_rate_or_zero(
    "container_network_receive_bytes_total", _TRAEFIK_POD_SELECTOR
)
TRAEFIK_NET_EGRESS = _summed_rate_or_zero(
    "container_network_transmit_bytes_total", _TRAEFIK_POD_SELECTOR
)
_CLUSTER_CONTAINER_SELECTOR = 'namespace!="",pod!="",container!=""'
NET_CLUSTER_RX = _summed_rate_or_zero(
    "container_network_receive_bytes_total", _CLUSTER_CONTAINER_SELECTOR
)
NET_CLUSTER_TX = _summed_rate_or_zero(
    "container_network_transmit_bytes_total", _CLUSTER_CONTAINER_SELECTOR
)
PHYSICAL_NET_FILTER = 'device!~"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*"'
NET_NODE_RX_PHYS = _summed_rate_or_zero(
    "node_network_receive_bytes_total", PHYSICAL_NET_FILTER
)
NET_NODE_TX_PHYS = _summed_rate_or_zero(
    "node_network_transmit_bytes_total", PHYSICAL_NET_FILTER
)
# NOTE(review): "total" is an alias of the TX expression, not RX + TX -- confirm
# this is intended.
NET_TOTAL_EXPR = NET_NODE_TX_PHYS
NET_INGRESS_EXPR = NET_NODE_RX_PHYS
NET_EGRESS_EXPR = NET_NODE_TX_PHYS
_NON_TRAEFIK_POD_SELECTOR = 'namespace!="traefik",pod!=""'
NET_INTERNAL_EXPR = (
    f"sum(rate(container_network_receive_bytes_total{{{_NON_TRAEFIK_POD_SELECTOR}}}[5m]) "
    f"+ rate(container_network_transmit_bytes_total{{{_NON_TRAEFIK_POD_SELECTOR}}}[5m]))"
    " or on() vector(0)"
)

# ---- Control plane and Traefik SLIs ---------------------------------------
APISERVER_5XX_RATE = 'sum(rate(apiserver_request_total{code=~"5.."}[5m]))'
APISERVER_P99_LATENCY_MS = _quantile_latency_ms(
    "0.99", "apiserver_request_duration_seconds_bucket"
)
ETCD_P99_LATENCY_MS = _quantile_latency_ms("0.99", "etcd_request_duration_seconds_bucket")
TRAEFIK_TOTAL_5M = "sum(rate(traefik_entrypoint_requests_total[5m]))"
TRAEFIK_SUCCESS_5M = 'sum(rate(traefik_entrypoint_requests_total{code!~"5.."}[5m]))'
TRAEFIK_SLI_5M = "(" + TRAEFIK_SUCCESS_5M + ") / clamp_min(" + TRAEFIK_TOTAL_5M + ", 1)"
TRAEFIK_P99_LATENCY_MS = _quantile_latency_ms(
    "0.99", "traefik_entrypoint_request_duration_seconds_bucket"
)
TRAEFIK_P95_LATENCY_MS = _quantile_latency_ms(
    "0.95", "traefik_entrypoint_request_duration_seconds_bucket"
)
# Availability target used by the Traefik burn-rate expressions.
SLO_AVAILABILITY = 0.999
|
|
|
|
|
|
def traefik_sli(window):
    """Return the PromQL ratio of non-5xx to all Traefik entrypoint requests.

    ``window`` is inserted verbatim as the ``rate()`` range. The denominator is
    clamped to at least 1.
    """

    def summed_rate(selector=""):
        return f"sum(rate(traefik_entrypoint_requests_total{selector}[{window}]))"

    non_5xx = summed_rate('{code!~"5.."}')
    return f"({non_5xx}) / clamp_min({summed_rate()}, 1)"
|
|
|
|
|
|
def traefik_burn(window):
    """Return the PromQL error ratio over ``window`` divided by the error budget.

    The budget is ``1 - SLO_AVAILABILITY``, rendered into the query as a literal.
    """
    error_budget = 1 - SLO_AVAILABILITY
    error_ratio = f"1 - ({traefik_sli(window)})"
    return f"({error_ratio}) / {error_budget}"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Panel factories
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def stat_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    decimals=None,
    thresholds=None,
    text_mode="value",
    legend=None,
    instant=False,
    value_suffix=None,
    links=None,
    targets=None,
    field_overrides=None,
    description=None,
    orientation=None,
    wide_layout=None,
):
    """Return a Grafana stat panel definition.

    ``expr`` becomes a single target with refId "A" unless ``targets`` is given,
    in which case ``targets`` is used and ``expr`` is ignored. ``legend`` is
    applied only when there is exactly one target. ``instant`` marks every
    target that does not already carry an ``instant`` key. A falsy
    ``thresholds`` selects the default grey/green steps.

    Target dicts passed in by the caller are shallow-copied before being
    annotated, so target lists shared between panels are never modified.
    """
    defaults = {
        "color": {"mode": "thresholds"},
        "mappings": [],
        "thresholds": thresholds
        or {
            "mode": "absolute",
            "steps": [
                {"color": "rgba(115, 115, 115, 1)", "value": None},
                {"color": "green", "value": 1},
            ],
        },
        "unit": unit,
        "custom": {"displayMode": "auto"},
    }
    if value_suffix:
        defaults["custom"]["valueSuffix"] = value_suffix
    if decimals is not None:
        defaults["decimals"] = decimals

    if targets is None:
        target_list = [{"expr": expr, "refId": "A"}]
    else:
        # Copy each target: legendFormat/instant are written below and must not
        # leak back into the caller's dicts.
        target_list = [dict(target) for target in targets]

    panel = {
        "id": panel_id,
        "type": "stat",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": target_list,
        "fieldConfig": {"defaults": defaults, "overrides": field_overrides or []},
        "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "textMode": text_mode,
        },
    }
    if orientation:
        panel["options"]["orientation"] = orientation
    if wide_layout is not None:
        panel["options"]["wideLayout"] = wide_layout
    if legend and len(target_list) == 1:
        target_list[0]["legendFormat"] = legend
    if instant:
        for target in target_list:
            target.setdefault("instant", True)
    if links:
        panel["links"] = links
    if description:
        panel["description"] = description
    return panel
|
|
|
|
|
|
def gauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    min_value=0,
    max_value=1,
    thresholds=None,
    links=None,
):
    """Return a Grafana gauge panel definition with a single query target.

    A falsy ``thresholds`` selects the default steps: green from the start and
    red at ``max_value``. ``links`` is attached only when truthy.
    """
    if not thresholds:
        thresholds = {
            "mode": "absolute",
            "steps": [
                {"color": "green", "value": None},
                {"color": "red", "value": max_value},
            ],
        }
    field_defaults = {"min": min_value, "max": max_value, "thresholds": thresholds}
    gauge_options = {
        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
        "orientation": "auto",
        "showThresholdMarkers": False,
        "showThresholdLabels": False,
    }
    panel = {
        "id": panel_id,
        "type": "gauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {"defaults": field_defaults, "overrides": []},
        "options": gauge_options,
    }
    if links:
        panel["links"] = links
    return panel
|
|
|
|
|
|
def timeseries_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    max_value=None,
    legend=None,
    legend_display="table",
    legend_placement="bottom",
    legend_calcs=None,
    time_from=None,
    links=None,
    targets=None,
    field_overrides=None,
    description=None,
    data_links=None,
):
    """Return a Grafana time-series panel definition.

    ``expr`` becomes a single target with refId "A" unless ``targets`` is given,
    in which case ``targets`` is used and ``expr`` is ignored. ``legend`` is
    applied only when there is exactly one target. ``links`` are panel links;
    ``data_links`` are stored on the field defaults.

    Target dicts passed in by the caller are shallow-copied before being
    annotated, so target lists shared between panels are never modified.
    """
    if targets is None:
        target_list = [{"expr": expr, "refId": "A"}]
    else:
        # Copy each target: legendFormat may be written below and must not leak
        # back into the caller's dicts.
        target_list = [dict(target) for target in targets]

    panel = {
        "id": panel_id,
        "type": "timeseries",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": target_list,
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": field_overrides or []},
        "options": {
            "legend": {
                "displayMode": legend_display,
                "placement": legend_placement,
            },
            "tooltip": {"mode": "multi"},
        },
    }
    if max_value is not None:
        panel["fieldConfig"]["defaults"]["max"] = max_value
    if legend and len(target_list) == 1:
        target_list[0]["legendFormat"] = legend
    if legend_calcs:
        panel["options"]["legend"]["calcs"] = legend_calcs
    if time_from:
        panel["timeFrom"] = time_from
    if links:
        panel["links"] = links
    if data_links:
        panel["fieldConfig"]["defaults"]["links"] = data_links
    if description:
        panel["description"] = description
    return panel
|
|
|
|
|
|
def state_timeline_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    description,
    thresholds,
    unit="percent",
    min_value=0,
    max_value=100,
    legend="{{suite}}",
    links=None,
    data_links=None,
):
    """Return a Grafana state-timeline panel coloured by thresholds.

    The single target is a range query formatted as a time series. ``min_value``
    and ``max_value`` are written to the field defaults unless they are None.
    ``links`` are panel links; ``data_links`` are stored on the field defaults.
    """
    field_defaults = {
        "color": {"mode": "thresholds"},
        "unit": unit,
        "thresholds": thresholds,
        "custom": {"fillOpacity": 70, "lineWidth": 0, "spanNulls": True},
    }
    for bound_key, bound in (("min", min_value), ("max", max_value)):
        if bound is not None:
            field_defaults[bound_key] = bound

    range_target = {
        "expr": expr,
        "refId": "A",
        "legendFormat": legend,
        "format": "time_series",
        "instant": False,
        "range": True,
    }
    timeline = {
        "id": panel_id,
        "type": "state-timeline",
        "title": title,
        "description": description,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [range_target],
        "fieldConfig": {"defaults": field_defaults, "overrides": []},
        "options": {
            "mergeValues": True,
            "showValue": "never",
            "legend": {"displayMode": "list", "placement": "bottom"},
            "tooltip": {"mode": "single", "sort": "none"},
        },
    }
    if links:
        timeline["links"] = links
    if data_links:
        field_defaults["links"] = data_links
    return timeline
|
|
|
|
|
|
def apply_bar_timeseries_style(panel, *, stacked=False, fill_opacity=70):
    """Switch a time-series panel to bar rendering and return the same panel.

    Any existing ``fieldConfig.defaults.custom`` mapping is replaced. With
    ``stacked`` set, normal stacking in group "A" is added.
    """
    bar_style = {
        "drawStyle": "bars",
        "barAlignment": 0,
        "barWidthFactor": 0.72,
        "lineWidth": 0,
        "fillOpacity": fill_opacity,
        "gradientMode": "none",
        "showPoints": "never",
        "spanNulls": True,
    }
    if stacked:
        bar_style["stacking"] = {"mode": "normal", "group": "A"}
    panel["fieldConfig"]["defaults"]["custom"] = bar_style
    return panel
|
|
|
|
|
|
def fixed_color_overrides(series_colors):
    """Return one fixed-color field override per entry of ``series_colors``.

    Keys are matched against the exact series name ("byName"); values are the
    fixed colors. Output order follows the mapping's iteration order.
    """
    overrides = []
    for series_name, color in series_colors.items():
        color_property = {
            "id": "color",
            "value": {"mode": "fixed", "fixedColor": color},
        }
        overrides.append(
            {
                "matcher": {"id": "byName", "options": series_name},
                "properties": [color_property],
            }
        )
    return overrides
|
|
|
|
|
|
def table_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    transformations=None,
    instant=False,
    options=None,
    filterable=True,
    footer=None,
    format=None,
    description=None,
    field_overrides=None,
    links=None,
):
    """Return a Grafana table panel definition with a single query target.

    ``options`` is merged over the default panel options (header shown, column
    filters off); ``footer`` is then set on top when it is not None. ``instant``
    and ``format`` are written to the target only when truthy.
    """
    table_options = {"showHeader": True, "columnFilters": False}
    table_options.update(options or {})
    if footer is not None:
        table_options["footer"] = footer

    query = {"expr": expr, "refId": "A"}
    if instant:
        query["instant"] = True
    if format:
        query["format"] = format

    table = {
        "id": panel_id,
        "type": "table",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {
            "defaults": {"unit": unit, "custom": {"filterable": filterable}},
            "overrides": field_overrides or [],
        },
        "options": table_options,
    }
    # Optional sections, attached in a fixed key order.
    optional_sections = (
        ("transformations", transformations),
        ("description", description),
        ("links", links),
    )
    for section, content in optional_sections:
        if content:
            table[section] = content
    return table
|
|
|
|
|
|
def pie_panel(panel_id, title, expr, grid, *, links=None, description=None):
    """Return a Grafana pie chart panel whose slices are labelled by namespace.

    Values use the percent unit and the legend is a list on the right.
    ``links`` and ``description`` are attached only when truthy.
    """
    namespace_target = {"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}
    field_config = {
        "defaults": {"unit": "percent", "color": {"mode": "palette-classic"}},
        "overrides": [],
    }
    pie_options = {
        "legend": {"displayMode": "list", "placement": "right"},
        "pieType": "pie",
        "displayLabels": [],
        "tooltip": {"mode": "single"},
        "colorScheme": "interpolateSpectral",
        "colorBy": "value",
        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
    }
    pie = {
        "id": panel_id,
        "type": "piechart",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [namespace_target],
        "fieldConfig": field_config,
        "options": pie_options,
    }
    if links:
        pie["links"] = links
    if description:
        pie["description"] = description
    return pie
|
|
|
|
|
|
def namespace_scope_variable(var_name, label):
    """Return a hidden custom dashboard variable for choosing a namespace scope.

    The three choices are workload, all, and infrastructure namespaces; the
    workload scope is the selected default.
    """
    choices = (
        ("workload namespaces only", NAMESPACE_SCOPE_WORKLOAD),
        ("all namespaces", NAMESPACE_SCOPE_ALL),
        ("infrastructure namespaces only", NAMESPACE_SCOPE_INFRA),
    )
    options = []
    for position, (text, value) in enumerate(choices):
        # Only the first choice starts selected.
        options.append({"text": text, "value": value, "selected": position == 0})
    default_choice = options[0]
    return {
        "name": var_name,
        "label": label,
        "type": "custom",
        "query": ",".join(f"{text} : {value}" for text, value in choices),
        "current": {
            "text": default_choice["text"],
            "value": default_choice["value"],
            "selected": True,
        },
        "options": options,
        "hide": 2,
        "multi": False,
        "includeAll": False,
        "refresh": 1,
        "sort": 0,
        "skipUrlSync": False,
    }
|
|
|
|
|
|
def namespace_scope_links(var_name):
    """Return three same-tab links that switch ``var_name`` to each scope.

    Each URL sets ``var_name`` to the percent-encoded scope value and passes
    every other name in ``NAMESPACE_SCOPE_VARS`` through as ``${name}``.
    """

    def scope_url(scope_value):
        encoded = urllib.parse.quote(scope_value, safe="")
        params = [
            f"var-{name}={encoded}" if name == var_name else f"var-{name}=${{{name}}}"
            for name in NAMESPACE_SCOPE_VARS
        ]
        return "?" + "&".join(params)

    scopes = (
        ("Workload namespaces only", NAMESPACE_SCOPE_WORKLOAD),
        ("All namespaces", NAMESPACE_SCOPE_ALL),
        ("Infrastructure namespaces only", NAMESPACE_SCOPE_INFRA),
    )
    links = []
    for link_title, scope_value in scopes:
        links.append(
            {"title": link_title, "url": scope_url(scope_value), "targetBlank": False}
        )
    return links
|
|
|
|
|
|
def testing_suite_variable():
    """Return the visible ``suite`` custom variable, defaulting to "All".

    One option is generated per name in ``PLATFORM_TEST_SUITE_NAMES``; "All"
    expands to ``PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER``.
    """
    suite_options = []
    query_parts = []
    for suite in PLATFORM_TEST_SUITE_NAMES:
        suite_options.append({"text": suite, "value": suite, "selected": False})
        query_parts.append(f"{suite} : {suite}")
    return {
        "name": "suite",
        "label": "Suite",
        "type": "custom",
        "query": ",".join(query_parts),
        "current": {"text": "All", "value": "$__all", "selected": True},
        "options": suite_options,
        "hide": 0,
        "multi": False,
        "includeAll": True,
        "allValue": PLATFORM_TEST_SUITE_VARIABLE_ALL_MATCHER,
        "refresh": 1,
        "sort": 1,
        "skipUrlSync": False,
    }
|
|
|
|
|
|
def testing_case_variable():
    """Return the visible ``test`` query variable, defaulting to "All".

    The query takes the top 250 ``test`` label values of the hourly test-case
    health rollup over the dashboard range, filtered by the ``suite`` and
    ``branch`` variables and excluding support categories and the
    ``__no_test_cases__`` placeholder.
    """
    label_matchers = ",".join(
        (
            'suite=~"${suite:regex}"',
            'branch!=""',
            'branch=~"${branch:regex}"',
            'test!=""',
            'test!="__no_test_cases__"',
            f'category!~"{PLATFORM_TEST_SUPPORT_CATEGORY_REGEX}"',
        )
    )
    rollup_series = (
        "platform_quality:test_case_health_rate:percent_1h{" + label_matchers + "}"
    )
    variable_query = (
        "query_result(topk(250, count by (test) (max_over_time("
        + rollup_series
        + "[$__range]))))"
    )
    return {
        "name": "test",
        "label": "Test Case",
        "type": "query",
        "query": variable_query,
        # Extracts the test label value from each returned series string.
        "regex": '/test="([^"]+)"/',
        "current": {"text": "All", "value": "$__all", "selected": True},
        "options": [],
        "hide": 0,
        "multi": False,
        "includeAll": True,
        "allValue": ".*",
        "refresh": 2,
        "sort": 1,
        "skipUrlSync": False,
    }
|
|
|
|
|
|
def testing_branch_variable():
    """Return the visible ``branch`` custom variable, defaulting to "All".

    One option is generated per entry of ``PLATFORM_TEST_BRANCH_OPTIONS``;
    "All" expands to the regex ``.*``.
    """
    branch_options = []
    query_parts = []
    for branch in PLATFORM_TEST_BRANCH_OPTIONS:
        branch_options.append({"text": branch, "value": branch, "selected": False})
        query_parts.append(f"{branch} : {branch}")
    return {
        "name": "branch",
        "label": "Branch",
        "type": "custom",
        "query": ",".join(query_parts),
        "current": {"text": "All", "value": "$__all", "selected": True},
        "options": branch_options,
        "hide": 0,
        "multi": False,
        "includeAll": True,
        "allValue": ".*",
        "refresh": 0,
        "sort": 0,
        "skipUrlSync": False,
    }
|
|
|
|
|
|
def jenkins_base_variable():
    """Return the visible ``jenkins_base`` textbox variable.

    Both the query and the current value start at ``JENKINS_UI_BASE_DEFAULT``.
    """
    default_base = JENKINS_UI_BASE_DEFAULT
    current_value = {"text": default_base, "value": default_base, "selected": True}
    return {
        "name": "jenkins_base",
        "label": "Jenkins Base URL",
        "type": "textbox",
        "query": default_base,
        "current": current_value,
        "hide": 0,
        "skipUrlSync": False,
    }
|
|
|
|
|
|
def jenkins_suite_links(base_var="${jenkins_base}"):
    """Return new-tab links to Jenkins: the root, then two links per suite.

    For each name in ``PLATFORM_TEST_SUITE_NAMES`` the job name comes from
    ``PLATFORM_TEST_JENKINS_JOB_BY_SUITE`` (falling back to the suite name) and
    is percent-encoded. Each suite gets a job link and a link to the artifacts
    of its last completed build.
    """

    def new_tab_link(link_title, url):
        return {"title": link_title, "url": url, "targetBlank": True}

    links = [new_tab_link("Open Jenkins", f"{base_var}/")]
    for suite in PLATFORM_TEST_SUITE_NAMES:
        job_name = PLATFORM_TEST_JENKINS_JOB_BY_SUITE.get(suite, suite)
        job_url = f"{base_var}/job/{urllib.parse.quote(job_name, safe='')}/"
        links.extend(
            (
                new_tab_link(f"{suite}: Job", job_url),
                new_tab_link(
                    f"{suite}: Last Artifacts",
                    job_url + "lastCompletedBuild/artifact/",
                ),
            )
        )
    return links
|
|
|
|
|
|
def jenkins_artifact_data_links(base_var="${jenkins_base}"):
    """Return new-tab data links to a series' Jenkins build and its artifacts.

    The job and build number are left as ``${__field.labels.jenkins_job}`` and
    ``${__field.labels.build_number}`` placeholders in the URL.
    """
    build_url = (
        f"{base_var}/job/${{__field.labels.jenkins_job}}"
        "/${__field.labels.build_number}/"
    )
    artifacts_link = {
        "title": "Open build artifacts",
        "url": build_url + "artifact/",
        "targetBlank": True,
    }
    build_link = {"title": "Open build", "url": build_url, "targetBlank": True}
    return [artifacts_link, build_link]
|
|
|
|
|
|
def jenkins_latest_artifact_data_links(base_var="${jenkins_base}"):
    """Return new-tab data links to a series' Jenkins job and latest artifacts.

    The job is left as a ``${__field.labels.jenkins_job}`` placeholder in the
    URL; the artifacts link points at ``lastCompletedBuild``.
    """
    job_url = f"{base_var}/job/${{__field.labels.jenkins_job}}/"
    latest_artifacts_link = {
        "title": "Open latest artifacts",
        "url": job_url + "lastCompletedBuild/artifact/",
        "targetBlank": True,
    }
    job_link = {"title": "Open Jenkins job", "url": job_url, "targetBlank": True}
    return [latest_artifacts_link, job_link]
|
|
|
|
|
|
def bargauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    legend=None,
    links=None,
    limit=None,
    sort_order="desc",
    thresholds=None,
    decimals=None,
    instant=False,
    overrides=None,
    data_links=None,
    include_color=True,
    description=None,
):
    """Return a horizontal Grafana bar gauge panel with a single query target.

    Unless ``expr`` already starts with ``sort(`` or ``sort_desc(``, it is
    wrapped according to ``sort_order`` ("desc" or "asc"; other values leave it
    unwrapped). A ``sortBy`` transformation on "Value" is always added, followed
    by a ``limit`` transformation when ``limit`` is truthy. The legend defaults
    to ``{{node}}`` and a falsy ``thresholds`` selects green/yellow/orange/red
    steps at 50, 70 and 85.
    """
    already_sorted = expr.strip().startswith(("sort(", "sort_desc("))
    sort_function = {"desc": "sort_desc", "asc": "sort"}.get(sort_order)
    if sort_function and not already_sorted:
        expr = f"{sort_function}({expr})"

    field_defaults = {"color": {"mode": "thresholds"}} if include_color else {}
    field_defaults["unit"] = unit
    field_defaults["min"] = 0
    # Only percent panels get a fixed upper bound.
    field_defaults["max"] = 100 if unit == "percent" else None
    field_defaults["thresholds"] = thresholds or {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 50},
            {"color": "orange", "value": 70},
            {"color": "red", "value": 85},
        ],
    }

    query = {"expr": expr, "refId": "A", "legendFormat": legend or "{{node}}"}
    if instant:
        query["instant"] = True

    panel = {
        "id": panel_id,
        "type": "bargauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {
            "defaults": field_defaults,
            "overrides": list(overrides) if overrides else [],
        },
        "options": {
            "displayMode": "basic",
            "orientation": "horizontal",
            "reduceOptions": {
                "calcs": ["lastNotNull"],
                "fields": "",
                "values": False,
            },
        },
    }
    if decimals is not None:
        field_defaults["decimals"] = decimals
    if links:
        panel["links"] = links
    if description:
        panel["description"] = description
    if data_links:
        field_defaults["links"] = data_links

    # Order the bars by value in the requested direction, then cap the count.
    transformations = [
        {"id": "sortBy", "options": {"fields": ["Value"], "order": sort_order}}
    ]
    if limit:
        transformations.append({"id": "limit", "options": {"limit": limit}})
    panel["transformations"] = transformations
    return panel
|
|
|
|
|
|
def set_bargauge_display_mode(panels, display_mode):
    """Apply a display mode to bar gauges, including gauges inside collapsed rows.

    Args:
        panels: Iterable of Grafana panel dicts; matching panels are mutated
            in place.
        display_mode: Value stored under ``options.displayMode`` of every
            panel whose ``type`` is ``"bargauge"``.

    Returns:
        None. Panels of other types are left untouched, but any non-empty
        ``panels`` list they carry is searched recursively.
    """
    for panel in panels:
        if panel.get("type") == "bargauge":
            # A bar gauge dict may not carry an "options" mapping yet; create
            # one on demand instead of failing with KeyError.
            panel.setdefault("options", {})["displayMode"] = display_mode
        nested = panel.get("panels")
        if nested:
            set_bargauge_display_mode(nested, display_mode)
|
|
|
|
|
|
def text_panel(panel_id, title, content, grid):
    """Return a markdown text panel that is not bound to any datasource."""
    markdown_options = {"mode": "markdown", "content": content}
    # Keyword construction keeps the same key order as the emitted JSON.
    return dict(
        id=panel_id,
        type="text",
        title=title,
        gridPos=grid,
        datasource=None,
        options=markdown_options,
    )
|
|
|
|
|
|
def row_panel(panel_id, title, y, *, collapsed=True, panels=None):
    """Build a full-width Grafana row positioned at vertical offset *y*.

    A collapsed row carries its children in a ``panels`` list (empty when
    none are supplied). An expanded row omits the key entirely, so *panels*
    is ignored in that case.
    """
    row = {
        "id": panel_id,
        "type": "row",
        "title": title,
        "gridPos": {"h": 1, "w": 24, "x": 0, "y": y},
        "collapsed": collapsed,
    }
    if collapsed:
        row["panels"] = panels or []
    return row
|
|
|
|
|
|
def apply_panel_descriptions(panels, descriptions):
    """Fill in missing panel descriptions by title, descending into nested panels.

    A panel receives ``descriptions[title]`` only when its title is a key of
    *descriptions* and it has no truthy ``description`` already. Panels are
    mutated in place; nothing is returned.
    """
    for entry in panels:
        name = entry.get("title")
        if name in descriptions and not entry.get("description"):
            entry["description"] = descriptions[name]
        children = entry.get("panels")
        if children:
            apply_panel_descriptions(children, descriptions)
|
|
|
|
|
|
# Panel-link titles keyed by dashboard UID. link_to() looks a UID up here and
# falls back to "Open <uid> dashboard" for UIDs that are not listed.
# NOTE(review): "atlas-jobs" and "atlas-testing" share the title
# "Atlas Testing", without the "Open" prefix used by every other entry —
# confirm this is intended.
DASHBOARD_LINK_TITLES = {
    "atlas-overview": "Open Atlas Overview",
    "atlas-pods": "Open Atlas Pods",
    "atlas-nodes": "Open Atlas Nodes",
    "atlas-storage": "Open Atlas Storage",
    "atlas-network": "Open Atlas Network",
    "atlas-mail": "Open Atlas Mail",
    "atlas-jobs": "Atlas Testing",
    "atlas-testing": "Atlas Testing",
    "atlas-power": "Open Atlas Power",
    "atlas-gitops": "Open Atlas GitOps",
    "atlas-gpu": "Open Atlas GPU",
}
|
|
|
|
|
|
def link_to(uid):
    """Return a one-element panel link list that opens dashboard *uid* in a new tab.

    The title comes from ``DASHBOARD_LINK_TITLES`` when the UID is listed
    there, otherwise it is ``"Open <uid> dashboard"``.
    """
    fallback_title = f"Open {uid} dashboard"
    link = {
        "title": DASHBOARD_LINK_TITLES.get(uid, fallback_title),
        "url": f"/d/{uid}",
        "targetBlank": True,
    }
    return [link]
|
|
|
|
|
|
def overview_link_to(uid):
    """Return the historical Overview dashboard link label.

    Always titles the link ``"Open <uid> dashboard"`` and opens it in a new
    tab; the result is a one-element list.
    """
    link = {
        "title": f"Open {uid} dashboard",
        "url": f"/d/{uid}",
        "targetBlank": True,
    }
    return [link]
|
|
|
|
|
|
# Help text for Overview dashboard panels, keyed by panel title. Each entry
# says what the panel shows and how to read the value.
# NOTE(review): presumably passed to apply_panel_descriptions(), which matches
# keys against each panel's exact "title" — confirm against the caller and
# keep keys in sync with the titles used by the dashboard builder.
OVERVIEW_PANEL_DESCRIPTIONS = {
    "Control Plane Ready": "Control-plane nodes currently Ready; full count is good, lower means Kubernetes core capacity is missing.",
    "Control Plane Workloads": "Non-core pods running on control-plane nodes; zero is good because control nodes should stay focused.",
    "Stuck Terminating": "Pods that Kubernetes cannot finish deleting; zero is good, growth means cleanup or storage may be stuck.",
    "Atlas Availability (365d)": "Rolling one-year Atlas availability; higher is better, below target means users saw downtime.",
    "Problem Pods": "Pods in unhealthy phases; zero is good, any count means a workload needs attention.",
    "CrashLoop / ImagePull": "Pods restarting or unable to pull images; zero is good, any count usually blocks a service.",
    "Workers Ready": "Worker nodes currently Ready; full count is good, lower means less place to run services.",
    "Hottest node: CPU": "Highest worker CPU load right now; lower is calmer, hot nodes may need pods moved.",
    "Hottest node: RAM": "Highest worker memory use right now; lower is safer, high values risk evictions.",
    "Hottest node: NET (rx+tx)": "Busiest node network rate; spikes can reveal traffic concentration or noisy services.",
    "Hottest node: I/O (r+w)": "Busiest node disk I/O rate; high values can explain slow storage-backed apps.",
    "Astreae Usage": "Percent of Astreae used; lower is safer, high values reduce storage headroom.",
    "Asteria Usage": "Percent of Asteria used; lower is safer, high values reduce storage headroom.",
    "Astreae Free": "Free space on Astreae; higher is better for backups and workload growth.",
    "Asteria Free": "Free space on Asteria; higher is better for backups and workload growth.",
    "Pyrphoros UPS Current": "Live Pyrphoros UPS draw and runtime; stable runtime means the lab can ride out short outages.",
    "Statera UPS Current": "Live Statera UPS draw and runtime; stable runtime means the lab can ride out short outages.",
    "UPS History (Power Draw)": "UPS power draw over time; steady draw is normal, spikes show sudden load changes.",
    "Current Enclosure Temperature": "Current tent temperature in C and F; moderate values protect hardware and plants.",
    "Current Enclosure Climate": "Current humidity and VPD; in-range values mean the enclosure climate is stable.",
    "Enclosure Climate History": "Temperature, humidity, and VPD over time; smooth movement is healthy, sharp swings need attention.",
    "Fan Intensity History": "Fan levels from Off to 10; warmer colors mean stronger cooling response and more thermal pressure.",
    "Flux Source": "Git branch Flux is applying; this should normally be the intended production branch.",
    "Current Gate Health": "Current gate-check health across suites; skipped or not-applicable checks count as healthy, failures lower it.",
    "CI Run Success (24h)": "Percent of published quality-gate CI runs that completed successfully in 24h; this is automation health, not raw test pass rate.",
    "Failed Runs (24h)": "Published quality-gate runs that failed in 24h; zero is good, any value needs a look.",
    "Suites With Runs (24h)": "Configured suites with at least one published quality-gate run in 24h; full count means the dashboard is fresh.",
    "Avg Coverage": "Average latest line coverage across suites; higher means code is better protected by tests.",
    "LOC Clean Suites": "Suites with no source files over 500 LOC; full count is good for maintainability.",
    "GitOps Health": "Flux readiness and suspension health over time; blue is perfect, warmer colors mean drift or pause.",
    "One-off Job Pods (age hours)": "Temporary job pods by age; low or empty is good, old pods usually need cleanup.",
    "Ariadne Run Volume": "Ariadne automation attempts and failures; attempts show activity, failures show work to investigate.",
    "Test Category Health": "Current category health across suites; skipped tests count as healthy, failures lower the lane.",
    "Jenkins Last Success (h, newest first)": "Age of recent Jenkins successes; lower is fresher and better.",
    "Jenkins Last Failure (h, newest first)": "Age of recent Jenkins failures; lower means a failure happened more recently.",
    "PVC Backup Health / Age": "Restic backup age by PVC; lower is better, very old backups mean restore risk.",
    "Mail Sent (1d)": "Outbound mail sent in the last day; useful context for mail health and bounce rates.",
    "Mail Bounces (1d)": "Outbound mail bounce rate and count; zero is best, high values risk delivery reputation.",
    "Mail Success Rate (1d)": "Outbound mail success rate; higher is better for user notifications.",
    "Mail Limit Used (30d)": "Postmark monthly send limit used; lower leaves more quota headroom.",
    "Postgres Connections Used": "Current Postgres connections; lower leaves room for apps during spikes.",
    "Postgres Hottest Connections": "Database with the most active connections; high values identify the pressure source.",
    "Namespace CPU Share": "CPU share by namespace in the selected scope; big slices show who is using compute.",
    "Namespace GPU Utilization": "Measured GPU activity share by namespace in the selected scope. Ambiguous shared-device activity is grouped as shared; idle appears only when utilization is zero.",
    "Namespace RAM Share": "Memory share by namespace in the selected scope; big slices show who may drive pressure.",
    "Worker Node CPU": "Worker CPU over time; lower is calmer, sustained high load may need rescheduling.",
    "Worker Node RAM": "Worker memory over time; lower is safer, sustained high use risks evictions.",
    "Control plane CPU": "Control-plane CPU over time; low steady usage means Kubernetes has control headroom.",
    "Control plane RAM": "Control-plane memory over time; low steady usage means Kubernetes has control headroom.",
    "Node Pod Share": "Share of pods per node; uneven share can reveal overloaded workers.",
    "Top Nodes by Pod Count": "Nodes with the most pods; lower and balanced is easier to operate.",
    "Cluster Ingress Throughput": "Traffic entering the cluster; spikes should line up with expected usage.",
    "Cluster Egress Throughput": "Traffic leaving the cluster; spikes should line up with expected usage.",
    "Intra-Cluster Throughput": "Traffic inside the cluster; high values can expose chatty services.",
    "Root Filesystem Usage": "Node root disk usage; lower is safer, high values can break kubelet.",
    "Nodes Closest to Full Astraios Disks": "Astraios disk fullness by node; lower is safer for storage reliability.",
}
|
|
|
|
|
|
# Help text for testing/quality-gate dashboard panels, keyed by panel title.
# Entries cover summary stats, per-suite breakdowns, history rows, drilldowns,
# telemetry-completeness checks, and SonarQube panels.
# NOTE(review): presumably passed to apply_panel_descriptions(), which matches
# keys against each panel's exact "title" — confirm against the caller.
TESTING_PANEL_DESCRIPTIONS = {
    "Current Gate Health (%)": "Average latest required gate checks passing across selected suites; this is the current quality state.",
    "CI Run Success Rate (24h)": "Percent of selected quality-gate CI runs that completed successfully in 24h; this is run health, not individual test pass rate.",
    "CI Run Success Rate (30d)": "Percent of selected quality-gate CI runs that completed successfully in 30d; higher means more stable automation.",
    "Failed Runs (24h)": "Selected quality-gate runs that failed in 24h; zero is good and anything else needs a look.",
    "CI Runs (24h)": "Selected quality-gate CI run count in 24h; zero means the dashboard may be stale.",
    "Suite Freshness (24h)": "Percent of selected suites with at least one quality-gate CI run in 24h; 100% means inputs are fresh.",
    "Avg Coverage (%)": "Average latest line coverage for selected suites; higher means better test protection.",
    "Suites with LOC >500": "Selected suites with oversized source files; zero is good for maintainability.",
    "Latest Gate Health by Suite": "Latest required gate health by suite; skipped and not-applicable results are healthy, failures lower it.",
    "CI Run Success by Suite (24h)": "24h CI run success rate by suite; lower rows mean recent jobs failed, aborted, or could not complete cleanly.",
    "Coverage by Suite (Latest, gate 95)": "Latest suite coverage; 95%+ is acceptable and 100% is strongest.",
    "Files <=500 LOC by Suite (Latest)": "Percent of source files within the 500-line limit; higher is easier to maintain.",
    "CI Runs And Test Result History": "Recent CI run, coverage, LOC, and raw test-result trends for selected suites.",
    "CI Run Success by Suite (7d rolling)": "Seven-day rolling CI run success rate by suite; this is run completion history, not raw test pass history.",
    "Test Category Health History": "Health by test category; skipped tests count as healthy, failures lower the lane.",
    "Daily Run Volume (Selected Scope)": "Rolling daily counts of published quality-gate runs; volume explains confidence.",
    "Coverage History by Suite": "Coverage over time by suite; rising lines mean better test protection.",
    "Files <=500 LOC History by Suite": "LOC compliance over time; blue lanes mean files stay within the size limit.",
    "Check Failure Rates By Suite": "Failure percent by check family; blue is zero failures, warmer colors show blockers.",
    "Tests Failure Rate": "Percent of test checks currently failing; blue means tests are clean.",
    "Coverage Failure Rate": "Percent of coverage checks currently failing; blue means coverage gates pass.",
    "LOC Failure Rate": "Percent of LOC checks currently failing; blue means file size gates pass.",
    "Style Failure Rate": "Percent of style checks currently failing; blue means style/docs gates pass.",
    "Gate Glue Failure Rate": "Percent of metric-contract checks failing; blue means dashboard telemetry is trustworthy.",
    "SonarQube Failure Rate": "Percent of Sonar checks failing; blue means Sonar quality gates pass.",
    "Supply Chain Failure Rate": "Percent of supply-chain checks failing; blue means artifact/image checks pass.",
    "Check Healthy Rates By Suite": "Healthy percent by check family; blue means all selected checks are good.",
    "Tests Healthy Rate": "Percent of test checks passing or not applicable; higher is better.",
    "Coverage Healthy Rate": "Percent of coverage checks passing or not applicable; higher is better.",
    "LOC Healthy Rate": "Percent of LOC checks passing or not applicable; higher is better.",
    "Style Healthy Rate": "Percent of style checks passing or not applicable; higher is better.",
    "Gate Glue Healthy Rate": "Percent of telemetry-contract checks passing; higher means cleaner reporting.",
    "SonarQube Healthy Rate": "Percent of Sonar checks passing or not applicable; higher is better.",
    "Supply Chain Healthy Rate": "Percent of supply-chain checks passing or not applicable; higher is better.",
    "Test Drilldowns And Problem Tests": "Test-case detail for finding which tests are hurting reliability.",
    "Problematic Tests Over Time (Top failures)": "Current outlier tests by rolling 24h failures; tests need repeat failures to stay visible.",
    "Most Problematic Test by Suite (30d)": "Worst test per suite summed over 30d; high counts can be historical debt.",
    "Selected Test Pass/Fail History": "Hourly pass/fail/skipped volume for the selected test filter.",
    "Selected Test Pass Rate History": "Pass rate history for the selected test filter; higher means the test is stable.",
    "Telemetry Completeness And Branches": "Checks that each suite publishes the data this dashboard needs.",
    "Tests Metrics Present by Suite": "Whether suite-level test counts are present; 100% means the suite is reporting.",
    "Checks Metrics Present by Suite": "Whether gate check metrics are present; 100% means health panels have inputs.",
    "Coverage Metrics Present by Suite": "Whether coverage metrics are present; 100% means coverage panels are reliable.",
    "LOC Compliance Metrics Present by Suite": "Whether LOC metrics are present; 100% means size panels are reliable.",
    "Test-Case Metrics Present by Suite": "Whether per-test metrics are present; 100% enables drilldowns.",
    "Real Test Cases Present by Suite": "Whether real test names are present; 100% means not just placeholder telemetry.",
    "Recent Branch Evidence by Suite (30d)": "Branches with recent CI evidence; unexpected branches can mean drift or stale work.",
    "Primary Branch Clean by Suite (30d)": "Percent clean of non-primary branch evidence; 100% means only main/master is reporting.",
    "SonarQube Project Health": "SonarQube availability, projects, fetch errors, and gate status.",
    "SonarQube API Up": "Whether the SonarQube exporter can reach SonarQube; 1 is good.",
    "Sonar Projects (Selected)": "Selected SonarQube project count; zero means Sonar is not tracking that suite.",
    "Sonar Gate Fetch Errors": "Sonar exporter fetch errors; zero is good because stale Sonar data misleads.",
    "Sonar Gate Status Mix (Selected)": "Mix of Sonar gate states; OK is good and non-OK needs cleanup.",
    "Sonar Gate Health by Project": "Sonar gate health over time by project; blue means OK.",
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Dashboard builders
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def build_overview():
|
|
panels = []
|
|
overview_link = overview_link_to
|
|
climate_drop_labels = "job,instance,pod,service,endpoint,namespace,controller_name,port_name,fan_group"
|
|
climate_temp_series = f"max without ({climate_drop_labels}) (typhon_temperature_celsius != 0)"
|
|
climate_humidity_series = f"max without ({climate_drop_labels}) (typhon_relative_humidity_percent != 0)"
|
|
climate_pressure_series = f"max without ({climate_drop_labels}) (typhon_vpd_kpa != 0)"
|
|
overview_pvc_backup_metric_presence = (
|
|
'count({__name__=~"pvc_backup_(count|last_success_timestamp_seconds|health_reason)",driver="restic"})'
|
|
)
|
|
overview_pvc_backup_missing = (
|
|
'label_replace(label_replace(vector(999), "namespace", "maintenance", "__name__", ".*"), '
|
|
'"pvc", "backup-telemetry-missing", "__name__", ".*")'
|
|
)
|
|
overview_pvc_backup_age = (
|
|
'max by (namespace, pvc) (((time() - pvc_backup_last_success_timestamp_seconds{driver="restic"}) / 3600) '
|
|
'or on(namespace,pvc,volume,driver) ((((pvc_backup_health_reason{driver="restic",reason=~"missing|no_completed|lookup_failed|unknown_timestamp"} > 0) '
|
|
f'* (pvc_backup_count{{driver="restic"}} > bool 0)) * 999))) or on() '
|
|
f'(({overview_pvc_backup_missing}) unless on() (({overview_pvc_backup_metric_presence}) > 0))'
|
|
)
|
|
|
|
def overview_metric_pair_expr(first_expr, first_name, second_expr, second_name):
|
|
return (
|
|
f'label_replace({first_expr}, "metric", "{first_name}", "__name__", ".*") '
|
|
f'or label_replace({second_expr}, "metric", "{second_name}", "__name__", ".*")'
|
|
)
|
|
|
|
def overview_platform_test_success_targets():
|
|
suites = [
|
|
("ariadne", "ariadne"),
|
|
("metis", "metis"),
|
|
("ananke", "ananke"),
|
|
("atlasbot", "atlasbot"),
|
|
("lesavka", "lesavka"),
|
|
("pegasus", "pegasus|pegasus-health|pegasus_health"),
|
|
("soteria", "soteria"),
|
|
("titan-iac", "titan-iac|titan_iac"),
|
|
("bstein-home", "bstein-home|bstein_home"),
|
|
("arcanagon", "arcanagon"),
|
|
("data-prepper", "data-prepper|data_prepper"),
|
|
]
|
|
targets = []
|
|
for index, (legend, suite_regex) in enumerate(suites):
|
|
total = f'sum({platform_runs_increase(f"suite=~\"{suite_regex}\",{PLATFORM_TEST_EXPORT_FILTER}", "1h")})'
|
|
passed = (
|
|
f'sum({platform_runs_increase(f"suite=~\"{suite_regex}\",status=~\"{PLATFORM_TEST_SUCCESS_STATUS}\",{PLATFORM_TEST_EXPORT_FILTER}", "1h")})'
|
|
)
|
|
targets.append(
|
|
{
|
|
"refId": chr(ord("A") + index),
|
|
"expr": f"(100 * ({passed}) / clamp_min(({total}), 1)) and on() (({total}) > 0) or on() vector(0)",
|
|
"legendFormat": legend,
|
|
}
|
|
)
|
|
return targets
|
|
|
|
age_thresholds = {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "green", "value": None},
|
|
{"color": "yellow", "value": 6},
|
|
{"color": "orange", "value": 24},
|
|
{"color": "red", "value": 48},
|
|
],
|
|
}
|
|
|
|
row1_stats = [
|
|
{
|
|
"id": 2,
|
|
"title": "Control Plane Ready",
|
|
"expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
|
|
"kind": "gauge",
|
|
"max_value": CONTROL_TOTAL,
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "red", "value": None},
|
|
{"color": "green", "value": CONTROL_TOTAL},
|
|
],
|
|
},
|
|
},
|
|
{
|
|
"id": 3,
|
|
"title": "Control Plane Workloads",
|
|
"expr": CONTROL_WORKLOADS_EXPR,
|
|
"kind": "stat",
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "green", "value": None},
|
|
{"color": "yellow", "value": 1},
|
|
{"color": "orange", "value": 2},
|
|
{"color": "red", "value": 3},
|
|
],
|
|
},
|
|
"links": overview_link("atlas-pods"),
|
|
},
|
|
{
|
|
"id": 5,
|
|
"title": "Stuck Terminating",
|
|
"expr": STUCK_TERMINATING_EXPR,
|
|
"kind": "stat",
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "green", "value": None},
|
|
{"color": "yellow", "value": 1},
|
|
{"color": "orange", "value": 2},
|
|
{"color": "red", "value": 3},
|
|
],
|
|
},
|
|
"links": overview_link("atlas-pods"),
|
|
},
|
|
{
|
|
"id": 27,
|
|
"title": "Atlas Availability (365d)",
|
|
"expr": UPTIME_PERCENT_EXPR,
|
|
"kind": "stat",
|
|
"thresholds": UPTIME_PERCENT_THRESHOLDS,
|
|
"unit": "percentunit",
|
|
"decimals": 4,
|
|
"text_mode": "value",
|
|
"instant": True,
|
|
"description": "Rolling 365-day availability from vmalert's precomputed atlas:availability:ratio_365d series. Grafana keeps the last successful rollup for up to 24h so one missed long-window evaluation does not render as No data.",
|
|
},
|
|
{
|
|
"id": 4,
|
|
"title": "Problem Pods",
|
|
"expr": PROBLEM_PODS_EXPR,
|
|
"kind": "stat",
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "green", "value": None},
|
|
{"color": "yellow", "value": 1},
|
|
{"color": "orange", "value": 2},
|
|
{"color": "red", "value": 3},
|
|
],
|
|
},
|
|
"links": overview_link("atlas-pods"),
|
|
},
|
|
{
|
|
"id": 6,
|
|
"title": "CrashLoop / ImagePull",
|
|
"expr": CRASHLOOP_EXPR,
|
|
"kind": "stat",
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "green", "value": None},
|
|
{"color": "yellow", "value": 1},
|
|
{"color": "orange", "value": 2},
|
|
{"color": "red", "value": 3},
|
|
],
|
|
},
|
|
"links": overview_link("atlas-pods"),
|
|
},
|
|
{
|
|
"id": 1,
|
|
"title": "Workers Ready",
|
|
"expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
|
|
"kind": "gauge",
|
|
"max_value": WORKER_TOTAL,
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "red", "value": None},
|
|
{"color": "orange", "value": WORKER_TOTAL - 2},
|
|
{"color": "yellow", "value": WORKER_TOTAL - 1},
|
|
{"color": "green", "value": WORKER_TOTAL},
|
|
],
|
|
},
|
|
},
|
|
]
|
|
|
|
def gauge_grid(idx):
|
|
width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
|
|
x = sum(GAUGE_WIDTHS[:idx])
|
|
return width, x
|
|
|
|
for idx, item in enumerate(row1_stats):
|
|
panel_id = item["id"]
|
|
width, x = gauge_grid(idx)
|
|
grid = {"h": 5, "w": width, "x": x, "y": 0}
|
|
kind = item.get("kind", "gauge")
|
|
if kind == "stat":
|
|
panels.append(
|
|
stat_panel(
|
|
panel_id,
|
|
item["title"],
|
|
item["expr"],
|
|
grid,
|
|
thresholds=item.get("thresholds"),
|
|
legend=None,
|
|
links=item.get("links"),
|
|
text_mode=item.get("text_mode", "value"),
|
|
value_suffix=item.get("value_suffix"),
|
|
unit=item.get("unit", "none"),
|
|
decimals=item.get("decimals"),
|
|
instant=item.get("instant", False),
|
|
description=item.get("description"),
|
|
)
|
|
)
|
|
else:
|
|
panels.append(
|
|
gauge_panel(
|
|
panel_id,
|
|
item["title"],
|
|
item["expr"],
|
|
grid,
|
|
min_value=0,
|
|
max_value=item.get("max_value", 5),
|
|
thresholds=item.get("thresholds"),
|
|
links=item.get("links"),
|
|
)
|
|
)
|
|
|
|
top_health_panels = [
|
|
(7, "Hottest node: CPU", topk_with_node(node_cpu_expr()), "percent"),
|
|
(8, "Hottest node: RAM", topk_with_node(node_mem_expr()), "percent"),
|
|
(9, "Hottest node: NET (rx+tx)", topk_with_node(node_net_expr()), "Bps"),
|
|
(10, "Hottest node: I/O (r+w)", topk_with_node(node_io_expr()), "Bps"),
|
|
(23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
|
|
(24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"),
|
|
(25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"),
|
|
(26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"),
|
|
]
|
|
for idx, (panel_id, title, expr, unit) in enumerate(top_health_panels):
|
|
is_hottest_panel = panel_id in {7, 8, 9, 10}
|
|
panels.append(
|
|
stat_panel(
|
|
panel_id,
|
|
title,
|
|
f"{expr}",
|
|
{"h": 2, "w": 3, "x": 3 * idx, "y": 5},
|
|
unit=unit,
|
|
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
|
|
text_mode="name_and_value" if is_hottest_panel else "value",
|
|
legend="{{node}}" if is_hottest_panel else None,
|
|
instant=is_hottest_panel,
|
|
links=overview_link("atlas-storage" if panel_id in {23, 24, 25, 26} else "atlas-nodes"),
|
|
)
|
|
)
|
|
|
|
mail_bounce_rate_thresholds = {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "green", "value": None},
|
|
{"color": "yellow", "value": 5},
|
|
{"color": "orange", "value": 8},
|
|
{"color": "red", "value": 10},
|
|
],
|
|
}
|
|
mail_limit_thresholds = {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "green", "value": None},
|
|
{"color": "yellow", "value": 70},
|
|
{"color": "orange", "value": 85},
|
|
{"color": "red", "value": 95},
|
|
],
|
|
}
|
|
mail_success_thresholds = {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "red", "value": None},
|
|
{"color": "orange", "value": 90},
|
|
{"color": "yellow", "value": 95},
|
|
{"color": "green", "value": 98},
|
|
],
|
|
}
|
|
dark_red = "dark-red"
|
|
dark_orange = "dark-orange"
|
|
dark_yellow = "dark-yellow"
|
|
dark_green = "dark-green"
|
|
dark_blue = "dark-blue"
|
|
test_success_thresholds = {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": dark_red, "value": None},
|
|
{"color": dark_orange, "value": 70},
|
|
{"color": dark_yellow, "value": 85},
|
|
{"color": dark_green, "value": 95},
|
|
{"color": dark_blue, "value": 100},
|
|
],
|
|
}
|
|
fan_intensity_thresholds = {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "#8f1d1d", "value": None},
|
|
{"color": "#c92a2a", "value": 1},
|
|
{"color": "#d95718", "value": 2},
|
|
{"color": "#e06c00", "value": 3},
|
|
{"color": "#d69605", "value": 4},
|
|
{"color": "#d4b106", "value": 5},
|
|
{"color": "#76a935", "value": 6},
|
|
{"color": "#2f9e44", "value": 7},
|
|
{"color": "#2f8599", "value": 8},
|
|
{"color": "#2870b8", "value": 9},
|
|
{"color": "#1f60c4", "value": 10},
|
|
],
|
|
}
|
|
fan_intensity_mappings = [
|
|
{
|
|
"type": "value",
|
|
"options": {
|
|
str(value): {
|
|
"text": "Off" if value == 0 else str(value),
|
|
"color": fan_intensity_thresholds["steps"][value]["color"],
|
|
}
|
|
for value in range(0, 11)
|
|
},
|
|
}
|
|
]
|
|
fan_intensity_expr = (
|
|
f'label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="1"}}), "fan", "Outlet", "__name__", ".*") '
|
|
f'or label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="2"}}), "fan", "Inlet - Inside", "__name__", ".*") '
|
|
f'or label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="3"}}), "fan", "Inlet - Outside", "__name__", ".*") '
|
|
f'or label_replace(max without ({climate_drop_labels}) (typhon_fan_speed_level{{port="4"}}), "fan", "Tent Interior", "__name__", ".*")'
|
|
)
|
|
gitops_health_history_expr = (
|
|
f'label_replace({GITOPS_KUSTOMIZATION_READY_PCT}, "signal", "Kustomizations Ready", "__name__", ".*") '
|
|
f'or label_replace({GITOPS_HELM_READY_PCT}, "signal", "HelmReleases Ready", "__name__", ".*") '
|
|
f'or label_replace({GITOPS_KUSTOMIZATION_NOT_SUSPENDED_PCT}, "signal", "Kustomizations Not Suspended", "__name__", ".*") '
|
|
f'or label_replace({GITOPS_HELM_NOT_SUSPENDED_PCT}, "signal", "HelmReleases Not Suspended", "__name__", ".*")'
|
|
)
|
|
compact_current_text = {"titleSize": 11, "valueSize": 20}
|
|
perfect_count_thresholds = {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": dark_red, "value": None},
|
|
{"color": dark_yellow, "value": max(len(PLATFORM_TEST_SUITE_NAMES) - 2, 1)},
|
|
{"color": dark_green, "value": len(PLATFORM_TEST_SUITE_NAMES) - 1},
|
|
{"color": dark_blue, "value": len(PLATFORM_TEST_SUITE_NAMES)},
|
|
],
|
|
}
|
|
failure_count_thresholds = {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": dark_blue, "value": None},
|
|
{"color": dark_yellow, "value": 1},
|
|
{"color": dark_orange, "value": 3},
|
|
{"color": dark_red, "value": 5},
|
|
],
|
|
}
|
|
overview_avg_coverage = f"(avg(({QUALITY_GATE_COVERAGE_BY_SUITE})) or on() vector(0))"
|
|
overview_category_health = (
|
|
f'avg by (category) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{'
|
|
f'suite=~"{PLATFORM_TEST_SUITE_CANONICAL_MATCHER}",branch!="",branch=~"main|master|origin/main|origin/master",'
|
|
f'category=~"{PLATFORM_TEST_OVERVIEW_CATEGORY_REGEX}"'
|
|
"})"
|
|
)
|
|
for panel_id, title, draw_expr, runtime_expr, y_pos in [
|
|
(40, "Pyrphoros UPS Current", ANANKE_UPS_DRAW_WATTS_DB, ANANKE_UPS_RUNTIME_DB, 7),
|
|
(144, "Statera UPS Current", ANANKE_UPS_DRAW_WATTS_TETHYS, ANANKE_UPS_RUNTIME_TETHYS, 10),
|
|
]:
|
|
panel = stat_panel(
|
|
panel_id,
|
|
title,
|
|
None,
|
|
{"h": 3, "w": 3, "x": 0, "y": y_pos},
|
|
unit="none",
|
|
text_mode="name_and_value",
|
|
targets=[
|
|
{
|
|
"expr": overview_metric_pair_expr(draw_expr, "Draw", runtime_expr, "Runtime"),
|
|
"refId": "A",
|
|
"legendFormat": "{{metric}}",
|
|
"instant": True,
|
|
}
|
|
],
|
|
field_overrides=[
|
|
{"matcher": {"id": "byName", "options": "Draw"}, "properties": [{"id": "unit", "value": "watt"}]},
|
|
{"matcher": {"id": "byName", "options": "Runtime"}, "properties": [{"id": "unit", "value": "s"}]},
|
|
],
|
|
links=overview_link("atlas-power"),
|
|
)
|
|
panel["options"]["text"] = compact_current_text
|
|
panels.append(panel)
|
|
|
|
ups_history = timeseries_panel(
|
|
41,
|
|
"UPS History (Power Draw)",
|
|
None,
|
|
{"h": 6, "w": 6, "x": 3, "y": 7},
|
|
unit="watt",
|
|
targets=[
|
|
{"refId": "A", "expr": ANANKE_UPS_DRAW_WATTS_DB_SERIES, "legendFormat": ANANKE_UPS_DB_NAME},
|
|
{"refId": "B", "expr": ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES, "legendFormat": ANANKE_UPS_TETHYS_NAME},
|
|
],
|
|
field_overrides=fixed_color_overrides(
|
|
{ANANKE_UPS_DB_NAME: dark_blue, ANANKE_UPS_TETHYS_NAME: dark_yellow}
|
|
),
|
|
legend_display="list",
|
|
legend_placement="bottom",
|
|
links=overview_link("atlas-power"),
|
|
)
|
|
ups_history["fieldConfig"]["defaults"]["custom"] = {
|
|
"drawStyle": "line",
|
|
"lineInterpolation": "linear",
|
|
"lineWidth": 2,
|
|
"fillOpacity": 22,
|
|
"showPoints": "never",
|
|
"spanNulls": True,
|
|
}
|
|
panels.append(ups_history)
|
|
temp_panel = stat_panel(
|
|
42,
|
|
"Current Enclosure Temperature",
|
|
None,
|
|
{"h": 3, "w": 3, "x": 0, "y": 13},
|
|
unit="none",
|
|
text_mode="name_and_value",
|
|
targets=[
|
|
{
|
|
"expr": overview_metric_pair_expr(
|
|
f"max({climate_temp_series}) or on() vector(0)",
|
|
"°C",
|
|
f"max(({climate_temp_series}) * 9 / 5 + 32) or on() vector(0)",
|
|
"°F",
|
|
),
|
|
"refId": "A",
|
|
"legendFormat": "{{metric}}",
|
|
"instant": True,
|
|
}
|
|
],
|
|
field_overrides=[
|
|
{"matcher": {"id": "byName", "options": "°C"}, "properties": [{"id": "unit", "value": "celsius"}]},
|
|
{"matcher": {"id": "byName", "options": "°F"}, "properties": [{"id": "unit", "value": "fahrenheit"}]},
|
|
],
|
|
links=overview_link("atlas-power"),
|
|
)
|
|
temp_panel["options"]["text"] = compact_current_text
|
|
panels.append(temp_panel)
|
|
climate_panel = stat_panel(
|
|
143,
|
|
"Current Enclosure Climate",
|
|
None,
|
|
{"h": 3, "w": 3, "x": 0, "y": 16},
|
|
unit="none",
|
|
text_mode="name_and_value",
|
|
targets=[
|
|
{
|
|
"expr": overview_metric_pair_expr(
|
|
f"max({climate_humidity_series}) or on() vector(0)",
|
|
"%RH",
|
|
f"max({climate_pressure_series}) or on() vector(0)",
|
|
"kPa",
|
|
),
|
|
"refId": "A",
|
|
"legendFormat": "{{metric}}",
|
|
"instant": True,
|
|
}
|
|
],
|
|
field_overrides=[
|
|
{"matcher": {"id": "byName", "options": "%RH"}, "properties": [{"id": "unit", "value": "suffix:%RH"}]},
|
|
{"matcher": {"id": "byName", "options": "kPa"}, "properties": [{"id": "unit", "value": "suffix:kPa"}]},
|
|
],
|
|
links=overview_link("atlas-power"),
|
|
)
|
|
climate_panel["options"]["text"] = compact_current_text
|
|
panels.append(climate_panel)
|
|
panels.append(
|
|
timeseries_panel(
|
|
43,
|
|
"Enclosure Climate History",
|
|
None,
|
|
{"h": 6, "w": 6, "x": 3, "y": 13},
|
|
unit="none",
|
|
targets=[
|
|
{"refId": "A", "expr": climate_temp_series, "legendFormat": "C"},
|
|
{"refId": "B", "expr": climate_humidity_series, "legendFormat": "RH"},
|
|
{"refId": "C", "expr": climate_pressure_series, "legendFormat": "P"},
|
|
{"refId": "D", "expr": f"(min_over_time({climate_temp_series}[$__range]) - 0.08)", "legendFormat": "C bound min"},
|
|
{"refId": "E", "expr": f"(max_over_time({climate_temp_series}[$__range]) + 0.08)", "legendFormat": "C bound max"},
|
|
{"refId": "F", "expr": f"clamp_min((min_over_time({climate_humidity_series}[$__range]) - 0.35), 0)", "legendFormat": "RH bound min"},
|
|
{"refId": "G", "expr": f"clamp_max((max_over_time({climate_humidity_series}[$__range]) + 0.35), 100)", "legendFormat": "RH bound max"},
|
|
{"refId": "H", "expr": f"clamp_min((min_over_time({climate_pressure_series}[$__range]) - 0.03), 0)", "legendFormat": "P bound min"},
|
|
{"refId": "I", "expr": f"(max_over_time({climate_pressure_series}[$__range]) + 0.03)", "legendFormat": "P bound max"},
|
|
],
|
|
field_overrides=[
|
|
{
|
|
"matcher": {"id": "byName", "options": "C"},
|
|
"properties": [
|
|
{"id": "unit", "value": "suffix:°C"},
|
|
{"id": "decimals", "value": 2},
|
|
{"id": "custom.axisPlacement", "value": "left"},
|
|
{"id": "custom.axisCenteredZero", "value": False},
|
|
],
|
|
},
|
|
{
|
|
"matcher": {"id": "byRegexp", "options": "C bound .*"},
|
|
"properties": [
|
|
{"id": "unit", "value": "suffix:°C"},
|
|
{"id": "custom.axisPlacement", "value": "left"},
|
|
{"id": "custom.axisCenteredZero", "value": False},
|
|
{"id": "custom.hideFrom", "value": {"legend": True, "tooltip": True, "viz": False}},
|
|
{"id": "custom.lineWidth", "value": 0},
|
|
{"id": "custom.fillOpacity", "value": 0},
|
|
{"id": "custom.showPoints", "value": "never"},
|
|
{"id": "color", "value": {"mode": "fixed", "fixedColor": "transparent"}},
|
|
],
|
|
},
|
|
{
|
|
"matcher": {"id": "byName", "options": "RH"},
|
|
"properties": [
|
|
{"id": "unit", "value": "suffix:%"},
|
|
{"id": "decimals", "value": 2},
|
|
{"id": "custom.axisPlacement", "value": "right"},
|
|
{"id": "custom.axisCenteredZero", "value": False},
|
|
],
|
|
},
|
|
{
|
|
"matcher": {"id": "byRegexp", "options": "RH bound .*"},
|
|
"properties": [
|
|
{"id": "unit", "value": "suffix:%"},
|
|
{"id": "custom.axisPlacement", "value": "right"},
|
|
{"id": "custom.axisCenteredZero", "value": False},
|
|
{"id": "custom.hideFrom", "value": {"legend": True, "tooltip": True, "viz": False}},
|
|
{"id": "custom.lineWidth", "value": 0},
|
|
{"id": "custom.fillOpacity", "value": 0},
|
|
{"id": "custom.showPoints", "value": "never"},
|
|
{"id": "color", "value": {"mode": "fixed", "fixedColor": "transparent"}},
|
|
],
|
|
},
|
|
{
|
|
"matcher": {"id": "byName", "options": "P"},
|
|
"properties": [
|
|
{"id": "unit", "value": "suffix:kPa"},
|
|
{"id": "custom.axisPlacement", "value": "right"},
|
|
{"id": "decimals", "value": 2},
|
|
{"id": "custom.axisCenteredZero", "value": False},
|
|
],
|
|
},
|
|
{
|
|
"matcher": {"id": "byRegexp", "options": "P bound .*"},
|
|
"properties": [
|
|
{"id": "unit", "value": "suffix:kPa"},
|
|
{"id": "custom.axisPlacement", "value": "right"},
|
|
{"id": "custom.axisCenteredZero", "value": False},
|
|
{"id": "custom.hideFrom", "value": {"legend": True, "tooltip": True, "viz": False}},
|
|
{"id": "custom.lineWidth", "value": 0},
|
|
{"id": "custom.fillOpacity", "value": 0},
|
|
{"id": "custom.showPoints", "value": "never"},
|
|
{"id": "color", "value": {"mode": "fixed", "fixedColor": "transparent"}},
|
|
],
|
|
},
|
|
],
|
|
legend_display="list",
|
|
legend_placement="bottom",
|
|
links=overview_link("atlas-power"),
|
|
description="Temperature on left axis, humidity and pressure on right axis with dynamic bound series so small swings remain visible.",
|
|
)
|
|
)
|
|
panels[-1]["fieldConfig"]["defaults"]["custom"] = {
|
|
"drawStyle": "line",
|
|
"lineInterpolation": "linear",
|
|
"lineWidth": 2,
|
|
"fillOpacity": 10,
|
|
"showPoints": "never",
|
|
"spanNulls": True,
|
|
}
|
|
fan_panel = state_timeline_panel(
|
|
141,
|
|
"Fan Intensity History",
|
|
fan_intensity_expr,
|
|
{"h": 6, "w": 6, "x": 9, "y": 13},
|
|
unit="none",
|
|
min_value=0,
|
|
max_value=10,
|
|
legend="{{fan}}",
|
|
thresholds=fan_intensity_thresholds,
|
|
links=overview_link("atlas-power"),
|
|
description="Fan intensity lanes on the 0-10 controller scale. Cooler colors are quiet/low intensity; warmer colors mean the enclosure is pushing harder.",
|
|
)
|
|
fan_panel["options"]["legend"] = {"displayMode": "list", "placement": "bottom", "showLegend": False}
|
|
fan_panel["options"]["mergeValues"] = False
|
|
fan_panel["options"]["showValue"] = "auto"
|
|
fan_panel["options"]["tooltip"] = {"mode": "multi", "sort": "none"}
|
|
fan_panel["fieldConfig"]["defaults"]["mappings"] = fan_intensity_mappings
|
|
panels.append(fan_panel)
|
|
flux_source = stat_panel(
|
|
140,
|
|
"Flux Source",
|
|
None,
|
|
{"h": 2, "w": 3, "x": 21, "y": 7},
|
|
unit="none",
|
|
text_mode="name",
|
|
thresholds={
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": dark_red, "value": None},
|
|
{"color": dark_blue, "value": 1},
|
|
],
|
|
},
|
|
targets=[
|
|
{
|
|
"expr": f"{GITOPS_SOURCE_INFO} or on() vector(0)",
|
|
"refId": "A",
|
|
"legendFormat": "{{branch}}",
|
|
"instant": True,
|
|
}
|
|
],
|
|
links=overview_link("atlas-gitops"),
|
|
description="Flux GitRepository branch reported by Ananke. Revision and object detail live in Atlas GitOps.",
|
|
)
|
|
flux_source["options"]["graphMode"] = "none"
|
|
flux_source["options"]["text"] = {"titleSize": 10, "valueSize": 14}
|
|
panels.append(flux_source)
|
|
for panel_id, title, expr, y_pos, unit, decimals, thresholds, links in [
|
|
(151, "Current Gate Health", PLATFORM_TEST_CURRENT_GATE_HEALTH, 9, "percent", 1, test_success_thresholds, "atlas-testing"),
|
|
(152, "CI Run Success (24h)", TEST_SUCCESS_RATE_24H, 11, "percent", 1, test_success_thresholds, "atlas-testing"),
|
|
(153, "Failed Runs (24h)", TEST_FAILURES_24H_TOTAL, 13, "none", 0, failure_count_thresholds, "atlas-testing"),
|
|
(154, "Suites With Runs (24h)", PLATFORM_TEST_ACTIVE_SUITES_24H, 15, "none", 0, perfect_count_thresholds, "atlas-testing"),
|
|
(155, "Avg Coverage", overview_avg_coverage, 17, "percent", 1, test_success_thresholds, "atlas-testing"),
|
|
]:
|
|
rail_panel = stat_panel(
|
|
panel_id,
|
|
title,
|
|
expr,
|
|
{"h": 2, "w": 3, "x": 21, "y": y_pos},
|
|
unit=unit,
|
|
decimals=decimals,
|
|
instant=True,
|
|
thresholds=thresholds,
|
|
links=overview_link(links),
|
|
)
|
|
rail_panel["options"]["graphMode"] = "none"
|
|
rail_panel["options"]["text"] = {"titleSize": 10, "valueSize": 19}
|
|
panels.append(rail_panel)
|
|
panels.append(
|
|
state_timeline_panel(
|
|
150,
|
|
"GitOps Health",
|
|
gitops_health_history_expr,
|
|
{"h": 6, "w": 6, "x": 15, "y": 7},
|
|
unit="percent",
|
|
min_value=0,
|
|
max_value=100,
|
|
legend="{{signal}}",
|
|
thresholds=test_success_thresholds,
|
|
links=overview_link("atlas-gitops"),
|
|
description="GitOps readiness and suspension health over time. Blue means perfect; warmer colors mean a readiness or suspension problem appeared.",
|
|
)
|
|
)
|
|
|
|
panels.append(
|
|
bargauge_panel(
|
|
44,
|
|
"One-off Job Pods (age hours)",
|
|
ONEOFF_JOB_POD_AGE_HOURS,
|
|
{"h": 5, "w": 8, "x": 0, "y": 32},
|
|
unit="h",
|
|
instant=True,
|
|
legend="{{namespace}}/{{pod}}",
|
|
thresholds=age_thresholds,
|
|
limit=12,
|
|
decimals=2,
|
|
links=overview_link("atlas-testing"),
|
|
include_color=False,
|
|
)
|
|
)
|
|
ariadne_volume = timeseries_panel(
|
|
45,
|
|
"Ariadne Run Volume",
|
|
None,
|
|
{"h": 6, "w": 6, "x": 9, "y": 7},
|
|
unit="none",
|
|
targets=[
|
|
{"expr": f"{ARIADNE_TASK_ATTEMPTS_SERIES} or on() vector(0)", "refId": "A", "legendFormat": "Attempts"},
|
|
{"expr": f"{ARIADNE_TASK_FAILURES_SERIES} or on() vector(0)", "refId": "B", "legendFormat": "Failures"},
|
|
],
|
|
legend_display="list",
|
|
legend_placement="bottom",
|
|
links=overview_link("atlas-testing"),
|
|
)
|
|
ariadne_volume["fieldConfig"]["overrides"] = fixed_color_overrides(
|
|
{"Attempts": dark_blue, "Failures": dark_red}
|
|
)
|
|
panels.append(apply_bar_timeseries_style(ariadne_volume, stacked=False))
|
|
panels.append(
|
|
state_timeline_panel(
|
|
46,
|
|
"Test Category Health",
|
|
overview_category_health,
|
|
{"h": 6, "w": 6, "x": 15, "y": 13},
|
|
unit="percent",
|
|
min_value=0,
|
|
max_value=100,
|
|
legend="{{category}}",
|
|
thresholds=test_success_thresholds,
|
|
links=overview_link("atlas-testing"),
|
|
description="Health by major test category across all suites over the last 24 hours. Skipped tests are healthy; failures and errors lower the lane.",
|
|
)
|
|
)
|
|
panels[-1]["options"]["legend"] = {"displayMode": "list", "placement": "bottom", "showLegend": False}
|
|
panels[-1]["options"]["mergeValues"] = False
|
|
panels[-1]["options"]["showValue"] = "auto"
|
|
panels[-1]["options"]["tooltip"] = {"mode": "multi", "sort": "none"}
|
|
panels[-1]["options"]["rowHeight"] = 0.9
|
|
panels[-1]["timeFrom"] = "24h"
|
|
for panel_id, title, metric, x_pos, description in [
|
|
(
|
|
142,
|
|
"Jenkins Last Success (h, newest first)",
|
|
"ariadne_jenkins_build_weather_job_last_success_timestamp_seconds",
|
|
8,
|
|
"Top 6 most recent Jenkins successes by age (newest first). Green means last run succeeded; red means last run did not succeed. Use Atlas Jobs for the full list.",
|
|
),
|
|
(
|
|
243,
|
|
"Jenkins Last Failure (h, newest first)",
|
|
"ariadne_jenkins_build_weather_job_last_failure_timestamp_seconds",
|
|
12,
|
|
"Top 6 most recent Jenkins failures by age (newest first). Green means last run succeeded; red means last run did not succeed. Use Atlas Jobs for the full list.",
|
|
),
|
|
]:
|
|
base_expr = f"min by (exported_job,job_url,weather_icon) ((time() - {metric}) / 3600)"
|
|
topk_expr = f"sort(bottomk(6, {base_expr}))"
|
|
success_expr = (
|
|
f'label_replace(({topk_expr}) and on(exported_job,job_url,weather_icon) '
|
|
'(max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) == 1), '
|
|
'"run_state", "ok", "exported_job", ".*")'
|
|
)
|
|
failure_expr = (
|
|
f'label_replace(({topk_expr}) and on(exported_job,job_url,weather_icon) '
|
|
'(max by (exported_job,job_url,weather_icon) (ariadne_jenkins_build_weather_job_last_status) != 1), '
|
|
'"run_state", "bad", "exported_job", ".*")'
|
|
)
|
|
panels.append(
|
|
{
|
|
"id": panel_id,
|
|
"type": "stat",
|
|
"title": title,
|
|
"datasource": PROM_DS,
|
|
"gridPos": {"h": 5, "w": 4, "x": x_pos, "y": 32},
|
|
"targets": [
|
|
{
|
|
"refId": "A",
|
|
"expr": f"sort(({success_expr}) or ({failure_expr}))",
|
|
"instant": True,
|
|
}
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "h",
|
|
"decimals": 1,
|
|
"min": 0,
|
|
"displayName": "${__field.labels.weather_icon} ${__field.labels.exported_job}",
|
|
"links": [
|
|
{
|
|
"title": "Open Jenkins job",
|
|
"url": "https://ci.bstein.dev/job/${__field.labels.exported_job}/",
|
|
"targetBlank": True,
|
|
}
|
|
],
|
|
},
|
|
"overrides": [
|
|
{
|
|
"matcher": {"id": "byRegexp", "options": '.*run_state="ok".*'},
|
|
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}],
|
|
},
|
|
{
|
|
"matcher": {"id": "byRegexp", "options": '.*run_state="bad".*'},
|
|
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}],
|
|
},
|
|
],
|
|
},
|
|
"options": {
|
|
"colorMode": "value",
|
|
"graphMode": "none",
|
|
"justifyMode": "left",
|
|
"orientation": "horizontal",
|
|
"wideLayout": True,
|
|
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
|
"textMode": "name_and_value",
|
|
"text": {"titleSize": 11, "valueSize": 11},
|
|
},
|
|
"transformations": [{"id": "sortBy", "options": {"fields": ["Value"], "order": "asc"}}],
|
|
"links": overview_link("atlas-testing"),
|
|
"description": description,
|
|
}
|
|
)
|
|
panels.append(
|
|
bargauge_panel(
|
|
47,
|
|
"PVC Backup Health / Age",
|
|
overview_pvc_backup_age,
|
|
{"h": 5, "w": 8, "x": 16, "y": 32},
|
|
unit="h",
|
|
instant=True,
|
|
legend="{{namespace}}/{{pvc}}",
|
|
sort_order="desc",
|
|
thresholds={
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "green", "value": None},
|
|
{"color": "yellow", "value": 20},
|
|
{"color": "orange", "value": 40},
|
|
{"color": "red", "value": 50},
|
|
],
|
|
},
|
|
include_color=False,
|
|
)
|
|
)
|
|
panels[-1]["links"] = overview_link("atlas-storage")
|
|
panels[-1]["description"] = (
|
|
"Backup age in hours computed from last-success timestamps for restic-managed PVCs (nightly target: <=20h green, <40h yellow, <50h orange, >=50h red). PVCs that have backup history but currently no successful backup (missing/no_completed/error) are pinned to 999h for visibility."
|
|
)
|
|
|
|
panels.append(
|
|
stat_panel(
|
|
30,
|
|
"Mail Sent (1d)",
|
|
'max(postmark_outbound_sent{window="1d"})',
|
|
{"h": 2, "w": 4, "x": 0, "y": 19},
|
|
unit="none",
|
|
links=overview_link("atlas-mail"),
|
|
)
|
|
)
|
|
panels.append(
|
|
{
|
|
"id": 31,
|
|
"type": "stat",
|
|
"title": "Mail Bounces (1d)",
|
|
"datasource": PROM_DS,
|
|
"gridPos": {"h": 2, "w": 4, "x": 8, "y": 19},
|
|
"targets": [
|
|
{
|
|
"expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
|
|
"refId": "A",
|
|
"legendFormat": "Rate",
|
|
},
|
|
{
|
|
"expr": 'max(postmark_outbound_bounced{window="1d"})',
|
|
"refId": "B",
|
|
"legendFormat": "Count",
|
|
},
|
|
],
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": {"mode": "thresholds"},
|
|
"custom": {"displayMode": "auto"},
|
|
"thresholds": mail_bounce_rate_thresholds,
|
|
"unit": "none",
|
|
},
|
|
"overrides": [
|
|
{
|
|
"matcher": {"id": "byName", "options": "Rate"},
|
|
"properties": [{"id": "unit", "value": "percent"}],
|
|
},
|
|
{
|
|
"matcher": {"id": "byName", "options": "Count"},
|
|
"properties": [{"id": "unit", "value": "none"}],
|
|
},
|
|
],
|
|
},
|
|
"options": {
|
|
"colorMode": "value",
|
|
"graphMode": "area",
|
|
"justifyMode": "center",
|
|
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
|
"textMode": "name_and_value",
|
|
},
|
|
"links": overview_link("atlas-mail"),
|
|
}
|
|
)
|
|
panels.append(
|
|
stat_panel(
|
|
32,
|
|
"Mail Success Rate (1d)",
|
|
'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
|
|
{"h": 2, "w": 4, "x": 4, "y": 19},
|
|
unit="percent",
|
|
thresholds=mail_success_thresholds,
|
|
decimals=1,
|
|
links=overview_link("atlas-mail"),
|
|
)
|
|
)
|
|
panels.append(
|
|
stat_panel(
|
|
33,
|
|
"Mail Limit Used (30d)",
|
|
"max(postmark_sending_limit_used_percent)",
|
|
{"h": 2, "w": 4, "x": 12, "y": 19},
|
|
unit="percent",
|
|
thresholds=mail_limit_thresholds,
|
|
decimals=1,
|
|
links=overview_link("atlas-mail"),
|
|
)
|
|
)
|
|
panels.append(
|
|
stat_panel(
|
|
34,
|
|
"Postgres Connections Used",
|
|
POSTGRES_CONN_USED,
|
|
{"h": 2, "w": 4, "x": 16, "y": 19},
|
|
decimals=0,
|
|
text_mode="name_and_value",
|
|
legend="{{conn}}",
|
|
instant=True,
|
|
)
|
|
)
|
|
panels.append(
|
|
stat_panel(
|
|
35,
|
|
"Postgres Hottest Connections",
|
|
POSTGRES_CONN_HOTTEST,
|
|
{"h": 2, "w": 4, "x": 20, "y": 19},
|
|
unit="none",
|
|
decimals=0,
|
|
text_mode="name_and_value",
|
|
legend="{{datname}}",
|
|
instant=True,
|
|
)
|
|
)
|
|
|
|
cpu_scope = "$namespace_scope_cpu"
|
|
gpu_scope = "$namespace_scope_gpu"
|
|
ram_scope = "$namespace_scope_ram"
|
|
|
|
panels.append(
|
|
pie_panel(
|
|
11,
|
|
"Namespace CPU Share",
|
|
namespace_cpu_share_expr(cpu_scope),
|
|
{"h": 9, "w": 8, "x": 0, "y": 23},
|
|
links=namespace_scope_links("namespace_scope_cpu"),
|
|
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
|
|
)
|
|
)
|
|
panels.append(
|
|
pie_panel(
|
|
12,
|
|
"Namespace GPU Utilization",
|
|
namespace_gpu_share_expr(gpu_scope),
|
|
{"h": 9, "w": 8, "x": 8, "y": 23},
|
|
links=namespace_scope_links("namespace_scope_gpu"),
|
|
description="Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero.",
|
|
)
|
|
)
|
|
panels.append(
|
|
pie_panel(
|
|
13,
|
|
"Namespace RAM Share",
|
|
namespace_ram_share_expr(ram_scope),
|
|
{"h": 9, "w": 8, "x": 16, "y": 23},
|
|
links=namespace_scope_links("namespace_scope_ram"),
|
|
description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
|
|
)
|
|
)
|
|
|
|
worker_filter = f"{WORKER_REGEX}"
|
|
panels.append(
|
|
timeseries_panel(
|
|
14,
|
|
"Worker Node CPU",
|
|
node_cpu_expr(worker_filter),
|
|
{"h": 12, "w": 12, "x": 0, "y": 44},
|
|
unit="percent",
|
|
legend="{{node}}",
|
|
legend_calcs=["last"],
|
|
legend_display="table",
|
|
legend_placement="right",
|
|
links=overview_link("atlas-nodes"),
|
|
)
|
|
)
|
|
panels.append(
|
|
timeseries_panel(
|
|
15,
|
|
"Worker Node RAM",
|
|
node_mem_expr(worker_filter),
|
|
{"h": 12, "w": 12, "x": 12, "y": 44},
|
|
unit="percent",
|
|
legend="{{node}}",
|
|
legend_calcs=["last"],
|
|
legend_display="table",
|
|
legend_placement="right",
|
|
links=overview_link("atlas-nodes"),
|
|
)
|
|
)
|
|
|
|
panels.append(
|
|
timeseries_panel(
|
|
16,
|
|
"Control plane CPU",
|
|
node_cpu_expr(CONTROL_ALL_REGEX),
|
|
{"h": 10, "w": 12, "x": 0, "y": 56},
|
|
unit="percent",
|
|
legend="{{node}}",
|
|
legend_display="table",
|
|
legend_placement="right",
|
|
)
|
|
)
|
|
panels.append(
|
|
timeseries_panel(
|
|
17,
|
|
"Control plane RAM",
|
|
node_mem_expr(CONTROL_ALL_REGEX),
|
|
{"h": 10, "w": 12, "x": 12, "y": 56},
|
|
unit="percent",
|
|
legend="{{node}}",
|
|
legend_display="table",
|
|
legend_placement="right",
|
|
)
|
|
)
|
|
|
|
panels.append(
|
|
pie_panel(
|
|
28,
|
|
"Node Pod Share",
|
|
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
|
|
{"h": 10, "w": 12, "x": 0, "y": 66},
|
|
)
|
|
)
|
|
panels.append(
|
|
bargauge_panel(
|
|
29,
|
|
"Top Nodes by Pod Count",
|
|
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
|
|
{"h": 10, "w": 12, "x": 12, "y": 66},
|
|
unit="none",
|
|
limit=12,
|
|
decimals=0,
|
|
thresholds={
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": "green", "value": None},
|
|
{"color": "yellow", "value": 50},
|
|
{"color": "orange", "value": 75},
|
|
{"color": "red", "value": 100},
|
|
],
|
|
},
|
|
instant=True,
|
|
include_color=False,
|
|
)
|
|
)
|
|
|
|
panels.append(
|
|
timeseries_panel(
|
|
18,
|
|
"Cluster Ingress Throughput",
|
|
NET_INGRESS_EXPR,
|
|
{"h": 7, "w": 8, "x": 0, "y": 37},
|
|
unit="Bps",
|
|
legend="Ingress (Traefik)",
|
|
legend_display="list",
|
|
legend_placement="bottom",
|
|
links=overview_link("atlas-network"),
|
|
)
|
|
)
|
|
panels.append(
|
|
timeseries_panel(
|
|
19,
|
|
"Cluster Egress Throughput",
|
|
NET_EGRESS_EXPR,
|
|
{"h": 7, "w": 8, "x": 8, "y": 37},
|
|
unit="Bps",
|
|
legend="Egress (Traefik)",
|
|
legend_display="list",
|
|
legend_placement="bottom",
|
|
links=overview_link("atlas-network"),
|
|
)
|
|
)
|
|
panels.append(
|
|
timeseries_panel(
|
|
20,
|
|
"Intra-Cluster Throughput",
|
|
NET_INTERNAL_EXPR,
|
|
{"h": 7, "w": 8, "x": 16, "y": 37},
|
|
unit="Bps",
|
|
legend="Internal traffic",
|
|
legend_display="list",
|
|
legend_placement="bottom",
|
|
links=overview_link("atlas-network"),
|
|
)
|
|
)
|
|
|
|
panels.append(
|
|
timeseries_panel(
|
|
21,
|
|
"Root Filesystem Usage",
|
|
root_usage_expr(),
|
|
{"h": 16, "w": 12, "x": 0, "y": 76},
|
|
unit="percent",
|
|
legend="{{node}}",
|
|
legend_calcs=["last"],
|
|
legend_display="table",
|
|
legend_placement="right",
|
|
time_from="30d",
|
|
links=overview_link("atlas-storage"),
|
|
)
|
|
)
|
|
panels.append(
|
|
timeseries_panel(
|
|
22,
|
|
"Nodes Closest to Full Astraios Disks",
|
|
astraios_usage_expr(),
|
|
{"h": 16, "w": 12, "x": 12, "y": 76},
|
|
unit="percent",
|
|
legend="{{node}}",
|
|
legend_calcs=["last"],
|
|
legend_display="table",
|
|
legend_placement="right",
|
|
time_from="1w",
|
|
links=overview_link("atlas-storage"),
|
|
)
|
|
)
|
|
apply_panel_descriptions(panels, OVERVIEW_PANEL_DESCRIPTIONS)
|
|
return {
|
|
"uid": "atlas-overview",
|
|
"title": "Atlas Overview",
|
|
"folderUid": PUBLIC_FOLDER,
|
|
"editable": False,
|
|
"annotations": {"list": []},
|
|
"panels": panels,
|
|
"schemaVersion": 39,
|
|
"style": "dark",
|
|
"tags": ["atlas", "overview"],
|
|
"templating": {
|
|
"list": [
|
|
namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"),
|
|
namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"),
|
|
namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"),
|
|
]
|
|
},
|
|
"time": {"from": "now-1h", "to": "now"},
|
|
"refresh": "1m",
|
|
"links": link_to("atlas-testing"),
|
|
}
|
|
|
|
|
|
def build_pods_dashboard():
    """Assemble the "Atlas Pods" dashboard model.

    The dashboard contains, in order: four red-on-nonzero stat panels,
    three detail tables, a per-node pod share pie, a per-node pod count bar
    gauge and the namespace plurality table. The returned dict uses uid
    ``atlas-pods`` and is placed in ``PRIVATE_FOLDER``.
    """
    control_plane_workloads = (
        f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})'
    )
    alert_stats = (
        (1, "Problem Pods", PROBLEM_PODS_EXPR, 0),
        (2, "CrashLoop / ImagePull", CRASHLOOP_EXPR, 6),
        (3, "Stuck Terminating (>10m)", STUCK_TERMINATING_EXPR, 12),
        (4, "Control Plane Workloads", control_plane_workloads, 18),
    )
    # Top row: each stat turns red as soon as its value reaches 1.
    # The thresholds literal is re-evaluated per panel, so no dict is shared.
    panels = [
        stat_panel(
            stat_id,
            stat_title,
            stat_expr,
            {"h": 4, "w": 6, "x": column, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
        for stat_id, stat_title, stat_expr, column in alert_stats
    ]

    # Full-width detail tables that only need labels expanded into columns.
    for table_id, table_title, table_expr, row in (
        (5, "Pods Not Running", PROBLEM_TABLE_EXPR, 4),
        (6, "CrashLoop / ImagePull", CRASHLOOP_TABLE_EXPR, 14),
    ):
        panels.append(
            table_panel(
                table_id,
                table_title,
                table_expr,
                {"h": 10, "w": 24, "x": 0, "y": row},
                unit="s",
                transformations=[{"id": "labelsToFields", "options": {}}],
            )
        )

    # The terminating table additionally drops rows at or below 600 seconds.
    panels.append(
        table_panel(
            7,
            "Terminating >10m",
            STUCK_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 24},
            unit="s",
            transformations=[
                {"id": "labelsToFields", "options": {}},
                {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
            ],
        )
    )
    panels.append(
        pie_panel(
            8,
            "Node Pod Share",
            '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
            {"h": 8, "w": 12, "x": 12, "y": 34},
        )
    )
    pod_count_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 50},
            {"color": "orange", "value": 75},
            {"color": "red", "value": 100},
        ],
    }
    panels.append(
        bargauge_panel(
            9,
            "Top Nodes by Pod Count",
            'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
            {"h": 8, "w": 12, "x": 0, "y": 34},
            unit="none",
            limit=12,
            decimals=0,
            thresholds=pod_count_thresholds,
            instant=True,
        )
    )

    # Percentage of each namespace's pods that sit on each node.
    share_expr = (
        '(sum by (namespace,node) (kube_pod_info{pod!="" , node!=""}) '
        '/ on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=""}), 1) * 100)'
    )
    # One term per known node carrying a small, position-dependent offset
    # (0 * node_info + rank * 1e-3); presumably a tie-breaker between nodes
    # that hold the same share of a namespace.
    rank_expr = " or ".join(
        f'(sum by (node) (kube_node_info{{node="{node_name}"}}) * 0 + {rank * 1e-3})'
        for rank, node_name in enumerate(CONTROL_ALL + WORKER_NODES, start=1)
    )
    score_expr = f"{share_expr} + on(node) group_left() ({rank_expr})"
    # 1 where a (namespace, node) score equals the namespace maximum, else 0.
    mask_expr = (
        f"{score_expr} == bool on(namespace) group_left() "
        f"(max by (namespace) ({score_expr}))"
    )
    plurality_expr = (
        f"{share_expr} * on(namespace,node) group_left() "
        f"({mask_expr})"
    )
    plurality_transformations = [
        {"id": "labelsToFields", "options": {}},
        {"id": "organize", "options": {"excludeByName": {"Time": True}}},
        {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 0}},
        {
            "id": "sortBy",
            "options": {"fields": ["Value"], "order": "desc"},
        },
        {
            "id": "groupBy",
            "options": {
                "fields": {
                    "namespace": {
                        "aggregations": [
                            {"field": "Value", "operation": "max"},
                            {"field": "node", "operation": "first"},
                        ]
                    }
                },
                "rowBy": ["namespace"],
            },
        },
    ]
    panels.append(
        table_panel(
            10,
            "Namespace Plurality by Node v27",
            plurality_expr,
            {"h": 8, "w": 24, "x": 0, "y": 42},
            unit="percent",
            transformations=plurality_transformations,
            instant=True,
            options={"showColumnFilters": False},
            filterable=False,
            footer={"show": False, "fields": "", "calcs": []},
            format="table",
        )
    )

    dashboard = {
        "uid": "atlas-pods",
        "title": "Atlas Pods",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "pods"],
    }
    return dashboard
|
|
|
|
|
|
def build_nodes_dashboard():
    """Build the "Atlas Nodes" dashboard model.

    Panels, in order: worker and control-plane readiness stats, the
    control-plane workload count, API server 5xx rate and P99 latency, etcd
    P99 latency, then time series for node CPU/RAM, control-plane CPU/RAM,
    root filesystem usage and Astraios usage.

    Returns:
        A dict with uid ``atlas-nodes`` placed in ``PRIVATE_FOLDER``, with a
        default time range of the last 12 hours.
    """
    panels = []
    panels.append(
        stat_panel(
            1,
            "Worker Nodes Ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
            {"h": 4, "w": 8, "x": 0, "y": 0},
            value_suffix=WORKER_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            2,
            "Control Plane Ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
            {"h": 4, "w": 8, "x": 8, "y": 0},
            value_suffix=CONTROL_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            3,
            "Control Plane Workloads",
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
            {"h": 4, "w": 8, "x": 16, "y": 0},
        )
    )
    panels.append(
        stat_panel(
            9,
            "API Server 5xx rate",
            APISERVER_5XX_RATE,
            {"h": 4, "w": 8, "x": 0, "y": 4},
            unit="req/s",
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 0.05},
                    {"color": "orange", "value": 0.2},
                    {"color": "red", "value": 0.5},
                ],
            },
            decimals=3,
        )
    )
    panels.append(
        stat_panel(
            10,
            "API Server P99 latency",
            APISERVER_P99_LATENCY_MS,
            {"h": 4, "w": 8, "x": 8, "y": 4},
            unit="ms",
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 250},
                    {"color": "orange", "value": 400},
                    {"color": "red", "value": 600},
                ],
            },
            decimals=1,
        )
    )
    panels.append(
        stat_panel(
            11,
            "etcd P99 latency",
            ETCD_P99_LATENCY_MS,
            {"h": 4, "w": 8, "x": 16, "y": 4},
            unit="ms",
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 50},
                    {"color": "orange", "value": 100},
                    {"color": "red", "value": 200},
                ],
            },
            decimals=1,
        )
    )
    panels.append(
        timeseries_panel(
            4,
            "Node CPU",
            node_cpu_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 8},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            5,
            "Node RAM",
            node_mem_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 17},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            6,
            "Control Plane (incl. titan-db) CPU",
            node_cpu_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 0, "y": 26},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            7,
            "Control Plane (incl. titan-db) RAM",
            node_mem_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 12, "y": 26},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            8,
            "Root Filesystem Usage",
            root_usage_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 35},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    panels.append(
        timeseries_panel(
            # Panel ids must be unique within a dashboard. This panel used to
            # reuse id 9, which already belongs to "API Server 5xx rate"
            # above, so it takes the next free id instead.
            12,
            "Astraios Usage",
            astraios_usage_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 44},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    return {
        "uid": "atlas-nodes",
        "title": "Atlas Nodes",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "nodes"],
    }
|
|
|
|
|
|
def build_storage_dashboard():
    """Assemble the "Atlas Storage" dashboard model.

    For the two mount points ``/mnt/astreae`` and ``/mnt/asteria`` the
    dashboard shows usage and free-space stats, per-node usage and a 90-day
    usage history, followed by two maintenance stats for the image sweeper.
    The returned dict uses uid ``atlas-storage`` in ``PRIVATE_FOLDER``.
    """
    panels = []

    # Row 1 (left half): overall usage percentage per mount point.
    for stat_id, stat_title, mount, column in (
        (1, "Astreae Usage", "/mnt/astreae", 0),
        (2, "Asteria Usage", "/mnt/asteria", 6),
    ):
        panels.append(
            stat_panel(
                stat_id,
                stat_title,
                astreae_usage_expr(mount),
                {"h": 5, "w": 6, "x": column, "y": 0},
                unit="percent",
                thresholds=PERCENT_THRESHOLDS,
            )
        )

    # Row 1 (right half): free space per mount point.
    for stat_id, stat_title, mount, column in (
        (3, "Astreae Free", "/mnt/astreae", 12),
        (4, "Asteria Free", "/mnt/asteria", 18),
    ):
        panels.append(
            stat_panel(
                stat_id,
                stat_title,
                astreae_free_expr(mount),
                {"h": 5, "w": 6, "x": column, "y": 0},
                unit="decbytes",
            )
        )

    # Row 2: per-node usage, restricted to nodes matching LONGHORN_NODE_REGEX.
    for series_id, series_title, mount, column in (
        (5, "Astreae Per-Node Usage", "/mnt/astreae", 0),
        (6, "Asteria Per-Node Usage", "/mnt/asteria", 12),
    ):
        panels.append(
            timeseries_panel(
                series_id,
                series_title,
                filesystem_usage_expr(mount, LONGHORN_NODE_REGEX),
                {"h": 9, "w": 12, "x": column, "y": 5},
                unit="percent",
                legend="{{node}}",
                legend_display="table",
                legend_placement="right",
                time_from="30d",
            )
        )

    # Row 3: long-range (90d) usage history per mount point.
    for series_id, series_title, mount, column in (
        (7, "Astreae Usage History", "/mnt/astreae", 0),
        (8, "Asteria Usage History", "/mnt/asteria", 12),
    ):
        panels.append(
            timeseries_panel(
                series_id,
                series_title,
                astreae_usage_expr(mount),
                {"h": 9, "w": 12, "x": column, "y": 14},
                unit="percent",
                time_from="90d",
            )
        )

    # Maintenance row: sweeper DaemonSet readiness and CronJob freshness.
    sweepers_ready_expr = (
        'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 100'
    )
    panels.append(
        stat_panel(
            30,
            "Maintenance Sweepers Ready",
            sweepers_ready_expr,
            {"h": 4, "w": 12, "x": 0, "y": 44},
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
        )
    )
    cron_freshness_expr = (
        'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"})'
    )
    # Yellow after one hour without a success, red after three.
    cron_freshness_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 3600},
            {"color": "red", "value": 10800},
        ],
    }
    panels.append(
        stat_panel(
            31,
            "Maintenance Cron Freshness (s)",
            cron_freshness_expr,
            {"h": 4, "w": 12, "x": 12, "y": 44},
            unit="s",
            thresholds=cron_freshness_thresholds,
        )
    )

    dashboard = {
        "uid": "atlas-storage",
        "title": "Atlas Storage",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "storage"],
    }
    return dashboard
|
|
|
|
|
|
def build_network_dashboard():
    """Assemble the "Atlas Network" dashboard model.

    Panels are emitted in this order:
      * ids 1-4: stat tiles for the Traefik success-rate, error-budget burn
        (1h and 6h) and p99 latency expressions,
      * ids 5-7: stat tiles for the ingress / egress / intra-cluster traffic
        expressions,
      * id 8: per-node throughput time series,
      * ids 9-10: top-10 namespace and pod tables,
      * ids 11-12: Traefik router and entrypoint request-rate time series.

    Returns:
        dict: dashboard definition with uid ``atlas-network`` placed in
        ``PRIVATE_FOLDER``.
    """

    def grid(height, width, x, y):
        # Key order (h, w, x, y) is kept stable so the rendered JSON does not churn.
        return {"h": height, "w": width, "x": x, "y": y}

    def absolute_steps(*steps):
        # Each call returns a fresh threshold dict so panels never share one object.
        return {
            "mode": "absolute",
            "steps": [{"color": color, "value": value} for color, value in steps],
        }

    burn_steps = (("green", None), ("yellow", 1), ("orange", 2), ("red", 4))
    table_legend = {"legend_display": "table", "legend_placement": "right"}

    panels = [
        stat_panel(
            1,
            "Ingress Success Rate (5m)",
            TRAEFIK_SLI_5M,
            grid(4, 6, 0, 0),
            unit="percentunit",
            decimals=2,
            thresholds=absolute_steps(
                ("red", None),
                ("orange", 0.995),
                ("yellow", 0.999),
                ("green", 0.9995),
            ),
        )
    ]

    # The two burn-rate tiles differ only in id, title, window and column.
    for panel_id, title, window, column in (
        (2, "Error Budget Burn (1h)", "1h", 6),
        (3, "Error Budget Burn (6h)", "6h", 12),
    ):
        panels.append(
            stat_panel(
                panel_id,
                title,
                traefik_burn(window),
                grid(4, 6, column, 0),
                thresholds=absolute_steps(*burn_steps),
                decimals=2,
            )
        )

    panels.append(
        stat_panel(
            4,
            "Edge P99 Latency (ms)",
            TRAEFIK_P99_LATENCY_MS,
            grid(4, 6, 18, 0),
            unit="ms",
            thresholds=absolute_steps(
                ("green", None),
                ("yellow", 200),
                ("orange", 350),
                ("red", 500),
            ),
            decimals=1,
        )
    )

    # Second row: three equally wide traffic tiles.
    for panel_id, title, expr, column in (
        (5, "Ingress Traffic", NET_INGRESS_EXPR, 0),
        (6, "Egress Traffic", NET_EGRESS_EXPR, 8),
        (7, "Intra-Cluster Traffic", NET_INTERNAL_EXPR, 16),
    ):
        panels.append(stat_panel(panel_id, title, expr, grid(4, 8, column, 4), unit="Bps"))

    panels.append(
        timeseries_panel(
            8,
            "Per-Node Throughput",
            f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
            grid(8, 24, 0, 8),
            unit="Bps",
            legend="{{node}}",
            **table_legend,
        )
    )

    top_namespaces_expr = (
        'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
        '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))'
    )
    top_pods_expr = (
        'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
        '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))'
    )
    for panel_id, title, expr, column in (
        (9, "Top Namespaces", top_namespaces_expr, 0),
        (10, "Top Pods", top_pods_expr, 12),
    ):
        panels.append(
            table_panel(
                panel_id,
                title,
                expr,
                grid(9, 12, column, 16),
                unit="Bps",
                transformations=[{"id": "labelsToFields", "options": {}}],
            )
        )

    panels.append(
        timeseries_panel(
            11,
            "Traefik Routers (req/s)",
            f"topk(10, {TRAEFIK_ROUTER_EXPR})",
            grid(9, 12, 0, 25),
            unit="req/s",
            legend="{{router}}",
            **table_legend,
        )
    )
    panels.append(
        timeseries_panel(
            12,
            "Traefik Entrypoints (req/s)",
            'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
            grid(9, 12, 12, 25),
            unit="req/s",
            legend="{{entrypoint}}",
            **table_legend,
        )
    )

    dashboard = {
        "uid": "atlas-network",
        "title": "Atlas Network",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "network"],
    }
    return dashboard
|
|
|
|
|
|
def build_mail_dashboard():
    """Assemble the "Atlas Mail" dashboard model from the ``postmark_*`` metrics.

    Panels are emitted in this order:
      * ids 1-4: sent counts (1d, 7d), a two-query bounce stat (rate + count)
        and the derived 1d success rate,
      * ids 5-8: sending-limit usage, sending limit, last-success timestamp
        and exporter error count,
      * ids 13-16: time series for bounce rate, bounced, sent and exporter
        errors.

    Panel ids 9-12 are not used by this function, and the time-series rows
    start at grid ``y=12``.

    Returns:
        dict: dashboard definition with uid ``atlas-mail`` placed in
        ``PRIVATE_FOLDER`` and a default 30-day time range.
    """

    def grid(height, width, x, y):
        # Key order (h, w, x, y) is kept stable so the rendered JSON does not churn.
        return {"h": height, "w": width, "x": x, "y": y}

    def absolute_steps(*steps):
        return {
            "mode": "absolute",
            "steps": [{"color": color, "value": value} for color, value in steps],
        }

    def query(expr, ref_id, legend_format):
        return {"expr": expr, "refId": ref_id, "legendFormat": legend_format}

    def unit_override(series_name, unit):
        return {
            "matcher": {"id": "byName", "options": series_name},
            "properties": [{"id": "unit", "value": unit}],
        }

    bounce_rate_thresholds = absolute_steps(
        ("green", None), ("yellow", 5), ("orange", 8), ("red", 10)
    )
    limit_thresholds = absolute_steps(
        ("green", None), ("yellow", 70), ("orange", 85), ("red", 95)
    )
    success_thresholds = absolute_steps(
        ("red", None), ("orange", 90), ("yellow", 95), ("green", 98)
    )

    # Hand-built stat panel: it carries two queries with per-series unit
    # overrides, so it is written out as a raw panel dict.
    bounce_panel = {
        "id": 3,
        "type": "stat",
        "title": "Mail Bounces (1d)",
        "datasource": PROM_DS,
        "gridPos": grid(4, 6, 12, 0),
        "targets": [
            query('max(postmark_outbound_bounce_rate{window="1d"})', "A", "Rate"),
            query('max(postmark_outbound_bounced{window="1d"})', "B", "Count"),
        ],
        "fieldConfig": {
            "defaults": {
                "color": {"mode": "thresholds"},
                "custom": {"displayMode": "auto"},
                "thresholds": bounce_rate_thresholds,
                "unit": "none",
            },
            "overrides": [
                unit_override("Rate", "percent"),
                unit_override("Count", "none"),
            ],
        },
        "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "textMode": "name_and_value",
        },
    }

    panels = [
        # Row 1 (y=0): volume, bounces and success rate.
        stat_panel(
            1,
            "Sent (1d)",
            'max(postmark_outbound_sent{window="1d"})',
            grid(4, 6, 0, 0),
            decimals=0,
        ),
        stat_panel(
            2,
            "Sent (7d)",
            'max(postmark_outbound_sent{window="7d"})',
            grid(4, 6, 6, 0),
            decimals=0,
        ),
        bounce_panel,
        stat_panel(
            4,
            "Success Rate (1d)",
            'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
            grid(4, 6, 18, 0),
            unit="percent",
            thresholds=success_thresholds,
            decimals=1,
        ),
        # Row 2 (y=4): sending limits and exporter state.
        stat_panel(
            5,
            "Limit Used (30d)",
            "max(postmark_sending_limit_used_percent)",
            grid(4, 6, 0, 4),
            thresholds=limit_thresholds,
            unit="percent",
            decimals=1,
        ),
        stat_panel(
            6,
            "Send Limit (30d)",
            "max(postmark_sending_limit)",
            grid(4, 6, 6, 4),
            decimals=0,
        ),
        stat_panel(
            7,
            "Last Success",
            "max(postmark_last_success_timestamp_seconds)",
            grid(4, 6, 12, 4),
            unit="dateTimeAsIso",
            decimals=0,
        ),
        stat_panel(
            8,
            "Exporter Errors",
            "sum(postmark_request_errors_total)",
            grid(4, 6, 18, 4),
            decimals=0,
        ),
    ]

    # Per-window history charts share the same legend configuration.
    for panel_id, title, expr, position, unit in (
        (
            13,
            "Bounce Rate (1d vs 7d)",
            "max by (window) (postmark_outbound_bounce_rate)",
            grid(8, 12, 0, 12),
            "percent",
        ),
        (
            14,
            "Bounced (1d vs 7d)",
            "max by (window) (postmark_outbound_bounced)",
            grid(8, 12, 12, 12),
            "none",
        ),
        (
            15,
            "Sent (1d vs 7d)",
            "max by (window) (postmark_outbound_sent)",
            grid(8, 12, 0, 20),
            "none",
        ),
    ):
        panels.append(
            timeseries_panel(
                panel_id,
                title,
                expr,
                position,
                unit=unit,
                legend="{{window}}",
                legend_display="table",
                legend_placement="right",
            )
        )

    panels.append(
        timeseries_panel(
            16,
            "Exporter Errors",
            "sum(postmark_request_errors_total)",
            grid(8, 12, 12, 20),
            unit="none",
        )
    )

    dashboard = {
        "uid": "atlas-mail",
        "title": "Atlas Mail",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-30d", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "mail"],
    }
    return dashboard
|
|
|
|
|
|
def build_jobs_dashboard():
|
|
panels = []
|
|
suite_var = "${suite:regex}"
|
|
test_var = "${test:regex}"
|
|
branch_var = "${branch:regex}"
|
|
success = PLATFORM_TEST_SUCCESS_STATUS
|
|
exported = PLATFORM_TEST_EXPORT_FILTER
|
|
runs_selector = f'suite=~"{suite_var}",{exported}'
|
|
runs_success_selector = f'{runs_selector},status=~"{success}"'
|
|
runs_failure_selector = f'{runs_selector},status!~"{success}"'
|
|
checks_selector = f'__name__=~".*_quality_gate_checks_total",suite=~"{suite_var}",{exported}'
|
|
coverage_metric_selector = f'__name__=~".*_quality_gate_coverage_percent",suite=~"{suite_var}",{exported}'
|
|
workspace_coverage_selector = f'suite=~"{suite_var}",{exported}'
|
|
smell_selector = f'suite=~"{suite_var}",{exported}'
|
|
build_info_selector = f'suite=~"{suite_var}",branch!="",branch=~"{branch_var}",{exported}'
|
|
selected_suite_universe = (
|
|
f'(count by (suite) (platform_quality_gate_build_info{{{build_info_selector}}}) >= bool 0)'
|
|
)
|
|
selected_suite_zero = f"(0 * ({selected_suite_universe}))"
|
|
selected_suite_missing = f"(({selected_suite_zero}) - 1)"
|
|
|
|
def _latest_check_status_value(
|
|
check_matcher: str = 'check!=""',
|
|
status_matcher: str = 'status!=""',
|
|
) -> str:
|
|
selector = platform_check_rollup_status_expr(
|
|
f'suite=~"{suite_var}"',
|
|
branch_matcher=f'branch!="",branch=~"{branch_var}"',
|
|
check_matcher=check_matcher,
|
|
status_matcher=status_matcher,
|
|
)
|
|
return selector
|
|
|
|
def _latest_check_flags(
|
|
check_matcher: str = 'check!=""',
|
|
status_matcher: str = 'status!=""',
|
|
) -> str:
|
|
values = _latest_check_status_value(check_matcher, status_matcher)
|
|
return f"clamp_max(max by (suite, check) (({values}) > 0), 1)"
|
|
|
|
def _latest_healthy_check_flags(check_matcher: str = 'check!=""') -> str:
|
|
ok_flags = _latest_check_flags(check_matcher, f'status=~"{non_failure}"')
|
|
failed_flags = _latest_check_flags(check_matcher, f'status!~"{non_failure}"')
|
|
return f"(({ok_flags}) unless on(suite, check) ({failed_flags}))"
|
|
|
|
suite_universe = " or ".join(
|
|
f'label_replace(vector(1), "suite", "{suite}", "__name__", ".*")'
|
|
for suite in PLATFORM_TEST_SUITE_NAMES
|
|
)
|
|
|
|
runs_24h_rollup_selector = (
|
|
f'{PLATFORM_TEST_RUNS_24H_ROLLUP}{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}"}}'
|
|
)
|
|
runs_24h_success_rollup_selector = (
|
|
f'{PLATFORM_TEST_RUNS_24H_ROLLUP}{{suite=~"{suite_var}",branch!="",'
|
|
f'branch=~"{branch_var}",status=~"{success}"}}'
|
|
)
|
|
runs_24h_failure_rollup_selector = (
|
|
f'{PLATFORM_TEST_RUNS_24H_ROLLUP}{{suite=~"{suite_var}",branch!="",'
|
|
f'branch=~"{branch_var}",status!~"{success}"}}'
|
|
)
|
|
runs_24h = f'(sum({runs_24h_rollup_selector}) or on() vector(0))'
|
|
runs_30d = f'(sum({platform_runs_increase(runs_selector, "30d", "15m")}) or on() vector(0))'
|
|
success_24h = f'(sum({runs_24h_success_rollup_selector}) or on() vector(0))'
|
|
success_30d = (
|
|
f'(sum({platform_runs_increase(runs_success_selector, "30d", "15m")}) or on() vector(0))'
|
|
)
|
|
failures_24h = f'(sum({runs_24h_failure_rollup_selector}) or on() vector(0))'
|
|
success_rate_24h = f"100 * ({success_24h}) / clamp_min(({runs_24h}), 1)"
|
|
success_rate_30d = f"100 * ({success_30d}) / clamp_min(({runs_30d}), 1)"
|
|
runs_by_suite_24h = f"sum by (suite) ({runs_24h_rollup_selector})"
|
|
success_by_suite_24h = f"sum by (suite) ({runs_24h_success_rollup_selector})"
|
|
success_rate_by_suite_24h = (
|
|
f'((100 * ({success_by_suite_24h}) / clamp_min(({runs_by_suite_24h}), 1)) '
|
|
f'and on(suite) (({runs_by_suite_24h}) > 0)) '
|
|
f'or on(suite) ({selected_suite_missing})'
|
|
)
|
|
non_failure = PLATFORM_TEST_NON_FAILURE_STATUS
|
|
standard_check_matcher = f'check=~"{PLATFORM_TEST_STANDARD_CHECK_REGEX}"'
|
|
current_gate_seen_vector = _latest_check_flags(standard_check_matcher)
|
|
current_gate_ok_vector = _latest_healthy_check_flags(standard_check_matcher)
|
|
current_gate_ok_checks = (
|
|
f"sum by (suite) ({current_gate_ok_vector})"
|
|
)
|
|
current_gate_seen_checks = (
|
|
f"sum by (suite) ({current_gate_seen_vector})"
|
|
)
|
|
current_gate_check_health_by_suite = (
|
|
f"(100 * ({current_gate_ok_checks}) / clamp_min(({current_gate_seen_checks}), 1))"
|
|
)
|
|
current_category_health_by_suite = (
|
|
f'min by (suite) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{suite=~"{suite_var}",'
|
|
f'branch!="",branch=~"{branch_var}",category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})'
|
|
)
|
|
current_gate_health_observed_by_suite = (
|
|
f"min by (suite) (({current_gate_check_health_by_suite}) "
|
|
f"or ({current_category_health_by_suite}))"
|
|
)
|
|
current_gate_health_by_suite = (
|
|
f"(({current_gate_health_observed_by_suite}) or on(suite) ({selected_suite_missing}))"
|
|
)
|
|
current_gate_health = (
|
|
f"(avg(({current_gate_health_observed_by_suite})) or on() vector(0))"
|
|
)
|
|
suite_freshness_24h = (
|
|
f"100 * (sum(({runs_by_suite_24h}) > bool 0) or on() vector(0)) "
|
|
f"/ clamp_min(count(({selected_suite_universe})), 1)"
|
|
)
|
|
success_history_runs = f'sum by (suite) ({platform_runs_increase(runs_selector, "7d")})'
|
|
success_history_by_suite = (
|
|
f'(100 * sum by (suite) ({platform_runs_increase(runs_success_selector, "7d")}) '
|
|
f'/ ({success_history_runs})) and on(suite) (({success_history_runs}) > 0)'
|
|
)
|
|
daily_success_volume = (
|
|
f"sum({runs_24h_success_rollup_selector}) or on() vector(0)"
|
|
)
|
|
daily_failure_volume = (
|
|
f"sum({runs_24h_failure_rollup_selector}) or on() vector(0)"
|
|
)
|
|
rollup_selector = f'suite=~"{suite_var}",branch=~"{branch_var}"'
|
|
coverage_by_suite = (
|
|
f"max by (suite) ({PLATFORM_TEST_COVERAGE_ROLLUP}{{{rollup_selector}}})"
|
|
)
|
|
coverage_history_by_suite = coverage_by_suite
|
|
coverage_with_missing = (
|
|
f"({coverage_by_suite}) or on(suite) ({selected_suite_missing})"
|
|
)
|
|
smell_by_suite = (
|
|
f"max by (suite) ({PLATFORM_TEST_SOURCE_LINES_OVER_500_ROLLUP}{{{rollup_selector}}})"
|
|
)
|
|
loc_files_by_suite = (
|
|
f"max by (suite) ({PLATFORM_TEST_SOURCE_FILES_ROLLUP}{{{rollup_selector}}})"
|
|
)
|
|
smell_history_by_suite = smell_by_suite
|
|
loc_files_history_by_suite = loc_files_by_suite
|
|
smell_with_missing = (
|
|
f"({smell_by_suite}) or on(suite) ({selected_suite_missing})"
|
|
)
|
|
loc_limit_compliance_by_suite = (
|
|
f"(100 * clamp_min(({loc_files_by_suite}) - ({smell_by_suite}), 0) / ({loc_files_by_suite})) "
|
|
f"and on(suite) (({loc_files_by_suite}) > 0)"
|
|
)
|
|
loc_limit_compliance_with_missing = (
|
|
f"({loc_limit_compliance_by_suite}) "
|
|
f"or on(suite) (100 * (1 - clamp_max(({smell_by_suite}), 1))) "
|
|
f"or on(suite) ({selected_suite_missing})"
|
|
)
|
|
loc_limit_compliance_history = (
|
|
f"(100 * clamp_min(({loc_files_history_by_suite}) - ({smell_history_by_suite}), 0) / ({loc_files_history_by_suite})) "
|
|
f"and on(suite) (({loc_files_history_by_suite}) > 0) "
|
|
f"or on(suite) (100 * (1 - clamp_max(({smell_history_by_suite}), 1)))"
|
|
)
|
|
average_coverage = f"(avg(({coverage_by_suite})) or on() vector(0))"
|
|
suites_loc_violating = f'(sum((({smell_by_suite}) > bool 0)) or on() vector(0))'
|
|
|
|
check_regex_tests = "tests|unit|build"
|
|
check_regex_coverage = "coverage"
|
|
check_regex_loc = "loc|smell"
|
|
check_regex_style = "docs|naming|hygiene|lint|docs_naming|style"
|
|
check_regex_gate_glue = "gate|glue|gate_glue"
|
|
check_regex_sonarqube = "sonarqube|sonar"
|
|
check_regex_supply_chain = "ironbank|supply_chain|image_compliance|artifact_security"
|
|
|
|
def _check_state_percent_series(regex: str, failed: bool) -> str:
|
|
family_vector = (
|
|
f'platform_quality:check_seen_flag:present_1h{{suite=~"{suite_var}",'
|
|
f'branch!="",branch=~"{branch_var}",check=~"{regex}"}}'
|
|
)
|
|
state_metric = (
|
|
"platform_quality:check_failed_flag:present_1h"
|
|
if failed
|
|
else "platform_quality:check_healthy_flag:present_1h"
|
|
)
|
|
state_vector = (
|
|
f'{state_metric}{{suite=~"{suite_var}",branch!="",'
|
|
f'branch=~"{branch_var}",check=~"{regex}"}}'
|
|
)
|
|
state_checks = f"sum by (suite) ({state_vector})"
|
|
total_checks = f"sum by (suite) ({family_vector})"
|
|
state_percent = f"(100 * ({state_checks}) / clamp_min(({total_checks}), 1))"
|
|
return f"(({state_percent}) or on(suite) ({selected_suite_zero}))"
|
|
|
|
failed_test_status_selector = (
|
|
f'platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",'
|
|
f'branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}}'
|
|
)
|
|
rollup_failed_tests = (
|
|
f"sum by (suite, test) (sum_over_time({failed_test_status_selector}[24h:1h]))"
|
|
)
|
|
current_problem_test_candidates = (
|
|
f"sum by (suite, test) (sum_over_time({failed_test_status_selector}[24h:1h] @ end()))"
|
|
)
|
|
problematic_tests_history_core = (
|
|
f"({rollup_failed_tests}) "
|
|
f"and on (suite, test) topk(12, ({current_problem_test_candidates}) >= 2)"
|
|
)
|
|
problematic_tests_history = problematic_tests_history_core
|
|
rollup_failed_tests_30d = (
|
|
f'sum by (suite, test) (sum_over_time(platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",branch=~"{branch_var}",test!="",test!="__no_test_cases__",status="failed"}}[30d:1h]))'
|
|
)
|
|
worst_test_per_suite_core = (
|
|
f"topk by (suite) (1, ({rollup_failed_tests_30d}))"
|
|
)
|
|
worst_test_per_suite = worst_test_per_suite_core
|
|
|
|
def _selected_status_volume(status: str) -> str:
|
|
return (
|
|
f'(sum(platform_quality:test_case_status:count_1h{{suite=~"{suite_var}",branch!="",'
|
|
f'branch=~"{branch_var}",test!="",test=~"{test_var}",test!="__no_test_cases__",'
|
|
f'status="{status}"}}) or on() vector(0))'
|
|
)
|
|
|
|
selected_test_pass_fail = [
|
|
{
|
|
"refId": "A",
|
|
"expr": _selected_status_volume("passed"),
|
|
"legendFormat": "Passed",
|
|
},
|
|
{
|
|
"refId": "B",
|
|
"expr": _selected_status_volume("failed"),
|
|
"legendFormat": "Failed",
|
|
},
|
|
{
|
|
"refId": "C",
|
|
"expr": _selected_status_volume("skipped"),
|
|
"legendFormat": "Skipped",
|
|
},
|
|
]
|
|
selected_test_pass_rate = (
|
|
f'avg by (suite) (platform_quality:test_case_pass_rate:percent_1h{{suite=~"{suite_var}",'
|
|
f'branch!="",branch=~"{branch_var}",test!="",test=~"{test_var}",test!="__no_test_cases__"}})'
|
|
)
|
|
category_pass_rate_history = (
|
|
f'avg by (category) ({PLATFORM_TEST_CATEGORY_HEALTH_ROLLUP}{{suite=~"{suite_var}",'
|
|
f'branch!="",branch=~"{branch_var}",category=~"{PLATFORM_TEST_CATEGORY_REGEX}"}})'
|
|
)
|
|
recent_branch_evidence = (
|
|
f'sort_desc(count by (suite, branch) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[30d:15m])))'
|
|
)
|
|
non_primary_branch_evidence = (
|
|
f'count by (suite) (max_over_time(platform_quality_gate_build_info{{{build_info_selector},branch!~"main|master|origin/main|origin/master|unknown"}}[30d:15m]))'
|
|
)
|
|
branch_evidence_by_suite = (
|
|
f'count by (suite) (max_over_time(platform_quality_gate_build_info{{{build_info_selector}}}[30d:15m]))'
|
|
)
|
|
primary_branch_clean_by_suite = (
|
|
f'(100 * ((({branch_evidence_by_suite}) > bool 0) '
|
|
f'unless on(suite) (({non_primary_branch_evidence}) > bool 0))) '
|
|
f'or on(suite) (0 * (({branch_evidence_by_suite}) > bool 0))'
|
|
)
|
|
|
|
def _missing_suite_series(presence_expr: str) -> str:
|
|
missing = f"(({suite_universe}) unless on(suite) {presence_expr})"
|
|
return f"({missing}) or on(suite) (0 * ({suite_universe}))"
|
|
|
|
def _present_suite_percent(presence_expr: str) -> str:
|
|
present = f"(({suite_universe}) and on(suite) {presence_expr})"
|
|
return f"(100 * ({present})) or on(suite) (0 * ({suite_universe}))"
|
|
|
|
present_tests_by_suite = _present_suite_percent(
|
|
f'count by (suite) ({{__name__=~".*_quality_gate_tests_total",{exported}}})'
|
|
)
|
|
present_checks_by_suite = _present_suite_percent(
|
|
f'count by (suite) ({{__name__=~".*_quality_gate_checks_total",{exported}}})'
|
|
)
|
|
present_coverage_by_suite = _present_suite_percent(
|
|
f"count by (suite) (platform_quality_gate_workspace_line_coverage_percent{{{exported}}})"
|
|
)
|
|
present_loc_by_suite = _present_suite_percent(
|
|
f"count by (suite) (platform_quality_gate_source_lines_over_500_total{{{exported}}}) "
|
|
f"and on(suite) count by (suite) (platform_quality_gate_source_files_total{{{exported}}})"
|
|
)
|
|
present_test_case_by_suite = _present_suite_percent(
|
|
f"count by (suite) (platform_quality_gate_test_case_result{{{exported}}})"
|
|
)
|
|
real_test_case_by_suite = _present_suite_percent(
|
|
f'count by (suite) (platform_quality_gate_test_case_result{{{exported},test!="__no_test_cases__"}})'
|
|
)
|
|
|
|
dark_red = "dark-red"
|
|
dark_orange = "dark-orange"
|
|
dark_yellow = "dark-yellow"
|
|
dark_green = "dark-green"
|
|
dark_blue = "dark-blue"
|
|
success_thresholds = {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": dark_red, "value": None},
|
|
{"color": dark_orange, "value": 90},
|
|
{"color": dark_yellow, "value": 93},
|
|
{"color": dark_green, "value": 95},
|
|
{"color": dark_blue, "value": 100},
|
|
],
|
|
}
|
|
coverage_thresholds = success_thresholds
|
|
failures_thresholds = {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": dark_blue, "value": None},
|
|
{"color": dark_green, "value": 0.01},
|
|
{"color": dark_yellow, "value": 1},
|
|
{"color": dark_orange, "value": 3},
|
|
{"color": dark_red, "value": 5},
|
|
],
|
|
}
|
|
problematic_test_thresholds = {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": dark_blue, "value": None},
|
|
{"color": dark_green, "value": 2},
|
|
{"color": dark_yellow, "value": 3},
|
|
{"color": dark_orange, "value": 5},
|
|
{"color": dark_red, "value": 8},
|
|
],
|
|
}
|
|
smell_thresholds = {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": dark_red, "value": None},
|
|
{"color": dark_green, "value": 0},
|
|
{"color": dark_yellow, "value": 1},
|
|
{"color": dark_orange, "value": 3},
|
|
{"color": dark_red, "value": 5},
|
|
],
|
|
}
|
|
missing_thresholds = {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": dark_green, "value": None},
|
|
{"color": dark_red, "value": 1},
|
|
],
|
|
}
|
|
|
|
panels.append(
|
|
stat_panel(
|
|
156,
|
|
"Current Gate Health (%)",
|
|
current_gate_health,
|
|
{"h": 5, "w": 3, "x": 0, "y": 0},
|
|
unit="percent",
|
|
decimals=2,
|
|
instant=True,
|
|
thresholds=success_thresholds,
|
|
)
|
|
)
|
|
panels.append(
|
|
stat_panel(
|
|
2,
|
|
"CI Run Success Rate (24h)",
|
|
success_rate_24h,
|
|
{"h": 5, "w": 4, "x": 0, "y": 0},
|
|
unit="percent",
|
|
decimals=2,
|
|
instant=True,
|
|
thresholds=success_thresholds,
|
|
)
|
|
)
|
|
panels.append(
|
|
stat_panel(
|
|
3,
|
|
"CI Run Success Rate (30d)",
|
|
success_rate_30d,
|
|
{"h": 5, "w": 4, "x": 4, "y": 0},
|
|
unit="percent",
|
|
decimals=2,
|
|
instant=True,
|
|
thresholds=success_thresholds,
|
|
)
|
|
)
|
|
panels.append(
|
|
stat_panel(
|
|
4,
|
|
"Failed Runs (24h)",
|
|
failures_24h,
|
|
{"h": 5, "w": 4, "x": 8, "y": 0},
|
|
unit="none",
|
|
instant=True,
|
|
thresholds=failures_thresholds,
|
|
)
|
|
)
|
|
panels.append(
|
|
stat_panel(
|
|
5,
|
|
"CI Runs (24h)",
|
|
runs_24h,
|
|
{"h": 5, "w": 4, "x": 12, "y": 0},
|
|
unit="none",
|
|
instant=True,
|
|
thresholds={
|
|
"mode": "absolute",
|
|
"steps": [{"color": dark_red, "value": None}, {"color": dark_green, "value": 1}],
|
|
},
|
|
)
|
|
)
|
|
panels.append(
|
|
stat_panel(
|
|
157,
|
|
"Suite Freshness (24h)",
|
|
suite_freshness_24h,
|
|
{"h": 5, "w": 3, "x": 15, "y": 0},
|
|
unit="percent",
|
|
decimals=0,
|
|
instant=True,
|
|
thresholds=success_thresholds,
|
|
)
|
|
)
|
|
panels.append(
|
|
stat_panel(
|
|
6,
|
|
"Avg Coverage (%)",
|
|
average_coverage,
|
|
{"h": 5, "w": 4, "x": 16, "y": 0},
|
|
unit="percent",
|
|
decimals=2,
|
|
instant=True,
|
|
thresholds=success_thresholds,
|
|
)
|
|
)
|
|
panels.append(
|
|
stat_panel(
|
|
7,
|
|
"Suites with LOC >500",
|
|
suites_loc_violating,
|
|
{"h": 5, "w": 4, "x": 20, "y": 0},
|
|
unit="none",
|
|
instant=True,
|
|
thresholds=smell_thresholds,
|
|
)
|
|
)
|
|
|
|
panels.append(
|
|
bargauge_panel(
|
|
8,
|
|
"Latest Gate Health by Suite",
|
|
current_gate_health_by_suite,
|
|
{"h": 8, "w": 8, "x": 0, "y": 5},
|
|
unit="percent",
|
|
instant=True,
|
|
legend="{{suite}}",
|
|
sort_order="asc",
|
|
thresholds=success_thresholds,
|
|
decimals=2,
|
|
)
|
|
)
|
|
panels[-1]["fieldConfig"]["defaults"]["mappings"] = [
|
|
{"type": "value", "options": {"-1": {"text": "missing"}}}
|
|
]
|
|
panels[-1]["description"] = (
|
|
"Current health by suite from required gate checks, capped by category-level test health. "
|
|
"Skipped and not-applicable results are healthy; failures and errors lower the value."
|
|
)
|
|
reliability_suite_panel = bargauge_panel(
|
|
9,
|
|
"CI Run Success by Suite (24h)",
|
|
success_rate_by_suite_24h,
|
|
{"h": 8, "w": 8, "x": 8, "y": 5},
|
|
unit="percent",
|
|
instant=True,
|
|
legend="{{suite}}",
|
|
sort_order="asc",
|
|
thresholds=success_thresholds,
|
|
decimals=2,
|
|
)
|
|
reliability_suite_panel["description"] = (
|
|
"24h CI run success rate. This is whether automation finished cleanly, so it can stay low "
|
|
"after failed or aborted runs even when tests and latest gate checks are green."
|
|
)
|
|
reliability_suite_panel["fieldConfig"]["defaults"]["mappings"] = [
|
|
{"type": "value", "options": {"-1": {"text": "no runs"}}}
|
|
]
|
|
panels.append(reliability_suite_panel)
|
|
history_panel = state_timeline_panel(
|
|
11,
|
|
"CI Run Success by Suite (7d rolling)",
|
|
success_history_by_suite,
|
|
{"h": 8, "w": 24, "x": 0, "y": 13},
|
|
thresholds=success_thresholds,
|
|
description=(
|
|
"Seven-day rolling CI run success rate per suite. Each suite gets its own lane, "
|
|
"so failed or aborted runs lower the lane color without implying raw test failures."
|
|
),
|
|
)
|
|
panels.append(history_panel)
|
|
|
|
run_volume_panel = timeseries_panel(
|
|
12,
|
|
"Daily Run Volume (Selected Scope)",
|
|
None,
|
|
{"h": 8, "w": 8, "x": 0, "y": 21},
|
|
unit="none",
|
|
targets=[
|
|
{"refId": "A", "expr": daily_success_volume, "legendFormat": "Success"},
|
|
{"refId": "B", "expr": daily_failure_volume, "legendFormat": "Failure"},
|
|
],
|
|
legend_display="list",
|
|
legend_placement="bottom",
|
|
legend_calcs=[],
|
|
)
|
|
run_volume_panel["description"] = (
|
|
"Twenty-four-hour rolling quality-gate run counts for the selected suite/branch scope. "
|
|
"This is volume, not a pass-rate percentage."
|
|
)
|
|
run_volume_panel["fieldConfig"]["defaults"]["min"] = 0
|
|
run_volume_panel["fieldConfig"]["defaults"]["custom"] = {
|
|
"drawStyle": "bars",
|
|
"barAlignment": 0,
|
|
"lineWidth": 0,
|
|
"fillOpacity": 70,
|
|
"stacking": {"mode": "normal", "group": "A"},
|
|
}
|
|
panels.append(run_volume_panel)
|
|
|
|
panels.append(
|
|
state_timeline_panel(
|
|
13,
|
|
"Coverage History by Suite",
|
|
coverage_history_by_suite,
|
|
{"h": 8, "w": 8, "x": 8, "y": 21},
|
|
thresholds=coverage_thresholds,
|
|
description=(
|
|
"Latest reported line coverage per suite over time. Coverage is separate "
|
|
"from LOC compliance so one signal cannot hide the other."
|
|
),
|
|
)
|
|
)
|
|
panels.append(
|
|
state_timeline_panel(
|
|
14,
|
|
"Files <=500 LOC History by Suite",
|
|
loc_limit_compliance_history,
|
|
{"h": 8, "w": 8, "x": 16, "y": 21},
|
|
thresholds=success_thresholds,
|
|
description=(
|
|
"Percent of LOC-gated source files at or under the 500-line limit. "
|
|
"This uses the existing file-count telemetry; longest-file history needs a new publisher metric."
|
|
),
|
|
)
|
|
)
|
|
|
|
check_dimensions = [
|
|
("Tests", check_regex_tests),
|
|
("Coverage", check_regex_coverage),
|
|
("LOC", check_regex_loc),
|
|
("Style", check_regex_style),
|
|
("Gate Glue", check_regex_gate_glue),
|
|
("SonarQube", check_regex_sonarqube),
|
|
("Supply Chain", check_regex_supply_chain),
|
|
]
|
|
|
|
def _append_check_trends(start_id: int, title_prefix: str, failed: bool, y: int) -> None:
    """Append one state-timeline panel per entry of ``check_dimensions``.

    Args:
        start_id: Panel id given to the first check family; later families
            get consecutive ids.
        title_prefix: Text placed after the family label in each panel title.
        failed: When true the failure thresholds, the "bad-state" description
            and the failed variant of the percent series are used; otherwise
            the success/healthy counterparts.
        y: Grid row of the first band of panels.
    """
    if failed:
        trend_thresholds = failures_thresholds
        trend_description = (
            "Latest bad-state percentage for this check family, evaluated over time. "
            "Higher means more selected suites/checks are failing in the freshness window; this is not an event-count spike chart."
        )
    else:
        trend_thresholds = success_thresholds
        trend_description = (
            "Latest acceptable-state percentage for this check family, evaluated over time. "
            "Higher means more selected suites/checks are healthy in the freshness window; gaps mean there was no check evidence."
        )

    for position, (label, regex) in enumerate(check_dimensions):
        if position < 4:
            # First band: four tiles, six grid columns each.
            grid = {"h": 7, "w": 6, "x": position * 6, "y": y}
        else:
            # Second band sits seven rows lower with eight-column tiles.
            grid = {"h": 7, "w": 8, "x": (position - 4) * 8, "y": y + 7}
        panels.append(
            state_timeline_panel(
                start_id + position,
                f"{label} {title_prefix}",
                _check_state_percent_series(regex, failed),
                grid,
                thresholds=trend_thresholds,
                description=trend_description,
            )
        )
|
|
|
|
_append_check_trends(130, "Failure Rate", True, 29)
|
|
_append_check_trends(138, "Healthy Rate", False, 43)
|
|
panels.append(
|
|
state_timeline_panel(
|
|
145,
|
|
"Problematic Tests Over Time (Top failures)",
|
|
problematic_tests_history,
|
|
{"h": 8, "w": 12, "x": 0, "y": 57},
|
|
thresholds=problematic_test_thresholds,
|
|
unit="none",
|
|
min_value=0,
|
|
max_value=None,
|
|
legend="{{suite}} - {{test}}",
|
|
description=(
|
|
"Current outlier tests by rolling 24h failure count. A test needs at least two recent "
|
|
"failures to appear, then falls off once it quiets down."
|
|
),
|
|
)
|
|
)
|
|
panels[-1]["links"] = jenkins_suite_links()
|
|
panels[-1]["fieldConfig"]["defaults"]["links"] = jenkins_latest_artifact_data_links()
|
|
panels.append(
|
|
bargauge_panel(
|
|
147,
|
|
"Most Problematic Test by Suite (30d)",
|
|
worst_test_per_suite,
|
|
{"h": 8, "w": 12, "x": 12, "y": 57},
|
|
unit="none",
|
|
instant=True,
|
|
legend="{{suite}} · {{test}}",
|
|
sort_order="desc",
|
|
thresholds=problematic_test_thresholds,
|
|
limit=9,
|
|
links=jenkins_suite_links(),
|
|
data_links=jenkins_latest_artifact_data_links(),
|
|
)
|
|
)
|
|
panels[-1]["description"] = (
|
|
"Worst test per suite summed across 30d. This catches historical repeat offenders even when the "
|
|
"current hourly top list is quiet."
|
|
)
|
|
panels.append(
|
|
timeseries_panel(
|
|
146,
|
|
"Selected Test Pass/Fail History",
|
|
None,
|
|
{"h": 8, "w": 12, "x": 0, "y": 65},
|
|
unit="none",
|
|
targets=selected_test_pass_fail,
|
|
legend_display="list",
|
|
legend_placement="bottom",
|
|
legend_calcs=[],
|
|
links=jenkins_suite_links(),
|
|
data_links=jenkins_artifact_data_links(),
|
|
)
|
|
)
|
|
panels[-1]["description"] = (
|
|
"Stacked hourly outcome volume for the selected suite/branch/test scope. "
|
|
"This uses vmalert rollups only, avoiding expensive raw 30-day per-test scans."
|
|
)
|
|
panels[-1]["fieldConfig"]["defaults"]["min"] = 0
|
|
panels[-1]["fieldConfig"]["defaults"]["custom"] = {
|
|
"drawStyle": "bars",
|
|
"barAlignment": 0,
|
|
"lineWidth": 0,
|
|
"fillOpacity": 70,
|
|
"stacking": {"mode": "normal", "group": "A"},
|
|
}
|
|
selected_pass_rate_panel = state_timeline_panel(
|
|
152,
|
|
"Selected Test Pass Rate History",
|
|
selected_test_pass_rate,
|
|
{"h": 8, "w": 12, "x": 12, "y": 65},
|
|
thresholds=success_thresholds,
|
|
legend="{{suite}}",
|
|
description=(
|
|
"Average pass rate per suite for the selected test filter, using memoized hourly "
|
|
"test-case pass-rate rollups instead of raw historical scans."
|
|
),
|
|
)
|
|
selected_pass_rate_panel["links"] = jenkins_suite_links()
|
|
selected_pass_rate_panel["fieldConfig"]["defaults"]["links"] = jenkins_artifact_data_links()
|
|
panels.append(selected_pass_rate_panel)
|
|
category_pass_rate_panel = state_timeline_panel(
|
|
153,
|
|
"Test Category Health History",
|
|
category_pass_rate_history,
|
|
{"h": 8, "w": 12, "x": 12, "y": 21},
|
|
thresholds=success_thresholds,
|
|
legend="{{category}}",
|
|
description=(
|
|
"Health by test category from memoized hourly rollups. Use the Suite filter to focus one "
|
|
"project; skipped tests are healthy, while failures and errors lower the lane."
|
|
),
|
|
)
|
|
category_pass_rate_panel["links"] = jenkins_suite_links()
|
|
panels.append(category_pass_rate_panel)
|
|
|
|
coverage_panel = bargauge_panel(
|
|
17,
|
|
"Coverage by Suite (Latest, gate 95)",
|
|
coverage_with_missing,
|
|
{"h": 8, "w": 12, "x": 0, "y": 73},
|
|
unit="percent",
|
|
instant=True,
|
|
legend="{{suite}}",
|
|
sort_order="asc",
|
|
thresholds=coverage_thresholds,
|
|
decimals=2,
|
|
)
|
|
coverage_panel["fieldConfig"]["defaults"]["mappings"] = [
|
|
{"type": "value", "options": {"-1": {"text": "missing"}}}
|
|
]
|
|
panels.append(coverage_panel)
|
|
|
|
smell_panel = bargauge_panel(
|
|
18,
|
|
"Files <=500 LOC by Suite (Latest)",
|
|
loc_limit_compliance_with_missing,
|
|
{"h": 8, "w": 12, "x": 12, "y": 73},
|
|
unit="percent",
|
|
instant=True,
|
|
legend="{{suite}}",
|
|
sort_order="asc",
|
|
thresholds=success_thresholds,
|
|
decimals=0,
|
|
)
|
|
smell_panel["fieldConfig"]["defaults"]["mappings"] = [
|
|
{"type": "value", "options": {"-1": {"text": "missing"}}}
|
|
]
|
|
smell_panel["description"] = "Percent of managed LOC-gated files at or under 500 lines. Older suite payloads fall back to 100%/0% until they emit platform_quality_gate_source_files_total."
|
|
panels.append(smell_panel)
|
|
|
|
panels.append(
|
|
bargauge_panel(
|
|
27,
|
|
"Tests Metrics Present by Suite",
|
|
present_tests_by_suite,
|
|
{"h": 7, "w": 6, "x": 0, "y": 81},
|
|
unit="percent",
|
|
instant=True,
|
|
legend="{{suite}}",
|
|
sort_order="asc",
|
|
thresholds=success_thresholds,
|
|
decimals=0,
|
|
)
|
|
)
|
|
panels.append(
|
|
bargauge_panel(
|
|
28,
|
|
"Checks Metrics Present by Suite",
|
|
present_checks_by_suite,
|
|
{"h": 7, "w": 6, "x": 6, "y": 81},
|
|
unit="percent",
|
|
instant=True,
|
|
legend="{{suite}}",
|
|
sort_order="asc",
|
|
thresholds=success_thresholds,
|
|
decimals=0,
|
|
)
|
|
)
|
|
panels.append(
|
|
bargauge_panel(
|
|
29,
|
|
"Coverage Metrics Present by Suite",
|
|
present_coverage_by_suite,
|
|
{"h": 7, "w": 6, "x": 12, "y": 81},
|
|
unit="percent",
|
|
instant=True,
|
|
legend="{{suite}}",
|
|
sort_order="asc",
|
|
thresholds=success_thresholds,
|
|
decimals=0,
|
|
)
|
|
)
|
|
panels.append(
|
|
bargauge_panel(
|
|
30,
|
|
"LOC Compliance Metrics Present by Suite",
|
|
present_loc_by_suite,
|
|
{"h": 7, "w": 6, "x": 18, "y": 81},
|
|
unit="percent",
|
|
instant=True,
|
|
legend="{{suite}}",
|
|
sort_order="asc",
|
|
thresholds=success_thresholds,
|
|
decimals=0,
|
|
)
|
|
)
|
|
panels.append(
|
|
stat_panel(
|
|
31,
|
|
"SonarQube API Up",
|
|
"(max(sonarqube_up) or on() vector(0))",
|
|
{"h": 6, "w": 4, "x": 0, "y": 88},
|
|
unit="none",
|
|
instant=True,
|
|
thresholds={
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{"color": dark_red, "value": None},
|
|
{"color": dark_green, "value": 1},
|
|
],
|
|
},
|
|
)
|
|
)
|
|
panels.append(
|
|
stat_panel(
|
|
32,
|
|
"Sonar Projects (Selected)",
|
|
f'(count(max by (project_key) (sonarqube_project_quality_gate_pass{{project_key=~"{suite_var}"}})) or on() vector(0))',
|
|
{"h": 6, "w": 4, "x": 4, "y": 88},
|
|
unit="none",
|
|
instant=True,
|
|
thresholds=failures_thresholds,
|
|
)
|
|
)
|
|
panels.append(
|
|
stat_panel(
|
|
33,
|
|
"Sonar Gate Fetch Errors",
|
|
"(max(sonarqube_quality_gate_fetch_errors_total) or on() vector(0))",
|
|
{"h": 6, "w": 4, "x": 8, "y": 88},
|
|
unit="none",
|
|
instant=True,
|
|
thresholds=failures_thresholds,
|
|
)
|
|
)
|
|
sonar_status_mix_panel = pie_panel(
|
|
34,
|
|
"Sonar Gate Status Mix (Selected)",
|
|
f'count by (status) (max by (project_key, status) (sonarqube_project_quality_gate_pass{{project_key=~"{suite_var}"}}))',
|
|
{"h": 6, "w": 4, "x": 12, "y": 88},
|
|
)
|
|
sonar_status_mix_panel["targets"][0]["legendFormat"] = "{{status}}"
|
|
panels.append(sonar_status_mix_panel)
|
|
panels.append(
|
|
state_timeline_panel(
|
|
35,
|
|
"Sonar Gate Health by Project",
|
|
f'{PLATFORM_TEST_SONAR_HEALTH_ROLLUP}{{project_key=~"{suite_var}"}}',
|
|
{"h": 6, "w": 8, "x": 16, "y": 88},
|
|
thresholds=success_thresholds,
|
|
unit="percent",
|
|
min_value=0,
|
|
max_value=100,
|
|
legend="{{project_key}}",
|
|
description=(
|
|
"SonarQube gate status over time by project. OK projects render as full healthy lanes; "
|
|
"non-OK projects drop to red without disappearing."
|
|
),
|
|
)
|
|
)
|
|
panels.append(
|
|
bargauge_panel(
|
|
148,
|
|
"Test-Case Metrics Present by Suite",
|
|
present_test_case_by_suite,
|
|
{"h": 6, "w": 12, "x": 0, "y": 94},
|
|
unit="percent",
|
|
instant=True,
|
|
legend="{{suite}}",
|
|
sort_order="asc",
|
|
thresholds=success_thresholds,
|
|
decimals=0,
|
|
)
|
|
)
|
|
panels.append(
|
|
bargauge_panel(
|
|
151,
|
|
"Real Test Cases Present by Suite",
|
|
real_test_case_by_suite,
|
|
{"h": 6, "w": 12, "x": 12, "y": 94},
|
|
unit="percent",
|
|
instant=True,
|
|
legend="{{suite}}",
|
|
sort_order="asc",
|
|
thresholds=success_thresholds,
|
|
decimals=0,
|
|
)
|
|
)
|
|
panels.append(
|
|
bargauge_panel(
|
|
149,
|
|
"Recent Branch Evidence by Suite (30d)",
|
|
recent_branch_evidence,
|
|
{"h": 7, "w": 12, "x": 0, "y": 100},
|
|
unit="none",
|
|
instant=True,
|
|
legend="{{suite}} · {{branch}}",
|
|
sort_order="desc",
|
|
thresholds=missing_thresholds,
|
|
decimals=0,
|
|
links=jenkins_suite_links(),
|
|
)
|
|
)
|
|
panels.append(
|
|
bargauge_panel(
|
|
150,
|
|
"Primary Branch Clean by Suite (30d)",
|
|
primary_branch_clean_by_suite,
|
|
{"h": 7, "w": 12, "x": 12, "y": 100},
|
|
unit="percent",
|
|
instant=True,
|
|
legend="{{suite}}",
|
|
sort_order="asc",
|
|
thresholds=success_thresholds,
|
|
decimals=0,
|
|
links=jenkins_suite_links(),
|
|
)
|
|
)
|
|
|
|
# Keep the first paint intentionally light. The detailed matrices remain
|
|
# available, but they stay collapsed so browsers do not render every series
|
|
# and legend before the operator asks for them.
|
|
panel_by_id = {panel["id"]: panel for panel in panels}
|
|
visible_layout = {
|
|
156: {"h": 4, "w": 3, "x": 0, "y": 0},
|
|
2: {"h": 4, "w": 3, "x": 3, "y": 0},
|
|
3: {"h": 4, "w": 3, "x": 6, "y": 0},
|
|
4: {"h": 4, "w": 3, "x": 9, "y": 0},
|
|
5: {"h": 4, "w": 3, "x": 12, "y": 0},
|
|
157: {"h": 4, "w": 3, "x": 15, "y": 0},
|
|
6: {"h": 4, "w": 3, "x": 18, "y": 0},
|
|
7: {"h": 4, "w": 3, "x": 21, "y": 0},
|
|
8: {"h": 7, "w": 6, "x": 0, "y": 4},
|
|
9: {"h": 7, "w": 6, "x": 6, "y": 4},
|
|
17: {"h": 7, "w": 6, "x": 12, "y": 4},
|
|
18: {"h": 7, "w": 6, "x": 18, "y": 4},
|
|
}
|
|
compact_panels = []
|
|
for panel_id, grid in visible_layout.items():
|
|
panel = panel_by_id[panel_id]
|
|
panel["gridPos"] = grid
|
|
compact_panels.append(panel)
|
|
|
|
def children(ids):
    """Return the panels registered in ``panel_by_id`` for *ids*, in the given order."""
    selected = []
    for wanted_id in ids:
        selected.append(panel_by_id[wanted_id])
    return selected
|
|
|
|
row_layout = {
|
|
11: {"h": 8, "w": 12, "x": 0, "y": 19},
|
|
153: {"h": 8, "w": 12, "x": 12, "y": 19},
|
|
12: {"h": 8, "w": 8, "x": 0, "y": 27},
|
|
13: {"h": 8, "w": 8, "x": 8, "y": 27},
|
|
14: {"h": 8, "w": 8, "x": 16, "y": 27},
|
|
145: {"h": 8, "w": 24, "x": 0, "y": 74},
|
|
147: {"h": 8, "w": 8, "x": 0, "y": 83},
|
|
146: {"h": 8, "w": 8, "x": 8, "y": 83},
|
|
152: {"h": 8, "w": 8, "x": 16, "y": 83},
|
|
27: {"h": 7, "w": 6, "x": 0, "y": 94},
|
|
28: {"h": 7, "w": 6, "x": 6, "y": 94},
|
|
29: {"h": 7, "w": 6, "x": 12, "y": 94},
|
|
30: {"h": 7, "w": 6, "x": 18, "y": 94},
|
|
148: {"h": 7, "w": 6, "x": 0, "y": 101},
|
|
151: {"h": 7, "w": 6, "x": 6, "y": 101},
|
|
150: {"h": 7, "w": 6, "x": 12, "y": 101},
|
|
149: {"h": 7, "w": 6, "x": 18, "y": 101},
|
|
31: {"h": 6, "w": 4, "x": 0, "y": 111},
|
|
32: {"h": 6, "w": 4, "x": 4, "y": 111},
|
|
33: {"h": 6, "w": 4, "x": 8, "y": 111},
|
|
34: {"h": 6, "w": 4, "x": 12, "y": 111},
|
|
35: {"h": 6, "w": 8, "x": 16, "y": 111},
|
|
}
|
|
for panel_id, grid in row_layout.items():
|
|
panel_by_id[panel_id]["gridPos"] = grid
|
|
|
|
compact_panels.extend(
|
|
[
|
|
row_panel(500, "CI Runs And Test Result History", 11, panels=children([11, 153, 12, 13, 14])),
|
|
row_panel(
|
|
501,
|
|
"Check Failure Rates By Suite",
|
|
12,
|
|
panels=children([130, 131, 132, 133, 134, 135, 136]),
|
|
),
|
|
row_panel(
|
|
502,
|
|
"Check Healthy Rates By Suite",
|
|
13,
|
|
panels=children([138, 139, 140, 141, 142, 143, 144]),
|
|
),
|
|
row_panel(
|
|
503,
|
|
"Test Drilldowns And Problem Tests",
|
|
14,
|
|
panels=children([145, 147, 146, 152]),
|
|
),
|
|
row_panel(
|
|
504,
|
|
"Telemetry Completeness And Branches",
|
|
15,
|
|
panels=children([27, 28, 29, 30, 148, 151, 150, 149]),
|
|
),
|
|
row_panel(
|
|
505,
|
|
"SonarQube Project Health",
|
|
16,
|
|
panels=children([31, 32, 33, 34, 35]),
|
|
),
|
|
]
|
|
)
|
|
panels = compact_panels
|
|
set_bargauge_display_mode(panels, "basic")
|
|
apply_panel_descriptions(panels, TESTING_PANEL_DESCRIPTIONS)
|
|
|
|
return {
|
|
"uid": "atlas-jobs",
|
|
"title": "Atlas Testing",
|
|
"folderUid": PRIVATE_FOLDER,
|
|
"editable": True,
|
|
"panels": panels,
|
|
"time": {"from": "now-30d", "to": "now"},
|
|
"annotations": {"list": []},
|
|
"schemaVersion": 39,
|
|
"style": "dark",
|
|
"tags": ["atlas", "testing", "quality-gate", "ci"],
|
|
"templating": {
|
|
"list": [
|
|
testing_suite_variable(),
|
|
testing_branch_variable(),
|
|
testing_case_variable(),
|
|
jenkins_base_variable(),
|
|
]
|
|
},
|
|
}
|
|
|
|
|
|
def build_testing_dashboard():
    """Return the jobs dashboard re-labelled as ``atlas-testing``.

    The dict produced by ``build_jobs_dashboard()`` is reused as-is except for
    three entries: the uid, the folder (``PUBLIC_DASHBOARD_FOLDER``) and the
    ``editable`` flag, which is switched off.
    """
    dashboard = build_jobs_dashboard()
    dashboard.update(
        {
            "uid": "atlas-testing",
            "folderUid": PUBLIC_DASHBOARD_FOLDER,
            "editable": False,
        }
    )
    return dashboard
|
|
|
|
|
|
def build_gitops_dashboard():
    """Assemble the "Atlas GitOps" dashboard (uid ``atlas-gitops``).

    Layout, top to bottom: a Flux source tile plus ready/suspended tiles for
    Kustomizations and HelmReleases, an exporter tile beside a readiness state
    timeline, then one table each for Flux sources, Kustomizations and
    HelmReleases.

    Returns:
        The dashboard as a plain dict, filed under ``PRIVATE_FOLDER`` with a
        default time range of the last 12 hours.
    """

    def _steps(*pairs):
        # Every call produces a brand-new thresholds dict, so no two panels
        # end up holding the same object.
        return {
            "mode": "absolute",
            "steps": [{"color": color, "value": floor} for color, floor in pairs],
        }

    def _info_with_ready(metric_prefix, labels):
        # Joins the "<prefix>_info" label set with the "<prefix>_ready" value.
        return (
            f"max by ({labels}) "
            f"({metric_prefix}_info{{{GITOPS_SELECTOR}}}) "
            "* on(namespace, name) group_left() max by (namespace, name) "
            f"({metric_prefix}_ready{{{GITOPS_SELECTOR}}})"
        )

    ready_flag = (("red", None), ("blue", 1))
    ready_percent = (("red", None), ("yellow", 99), ("blue", 100))
    none_suspended = (("blue", None), ("red", 1))

    # One overrides list, handed to all three tables below.
    value_overrides = [
        {
            "matcher": {"id": "byName", "options": "Value"},
            "properties": [{"id": "thresholds", "value": _steps(*ready_flag)}],
        }
    ]

    def _status_table(panel_id, title, expr, grid, description):
        return table_panel(
            panel_id,
            title,
            expr,
            grid,
            instant=True,
            format="table",
            transformations=[{"id": "labelsToFields", "options": {}}],
            field_overrides=value_overrides,
            description=description,
        )

    kustomization_table = _info_with_ready(
        "ananke_gitops_kustomization",
        "namespace, name, path, source_namespace, source_name, revision, ready, reason",
    )
    helm_table = _info_with_ready(
        "ananke_gitops_helmrelease",
        "namespace, name, chart, version, app_version, revision, ready, reason",
    )
    source_table = _info_with_ready(
        "ananke_gitops_flux_source",
        "namespace, name, url, branch, revision, ready, reason",
    )

    flux_source_expr = f"{GITOPS_SOURCE_INFO} or on() vector(0)"
    readiness_history_expr = (
        f'label_replace({GITOPS_KUSTOMIZATION_READY_PCT}, "kind", "Kustomizations", "__name__", ".*") '
        f'or label_replace({GITOPS_HELM_READY_PCT}, "kind", "HelmReleases", "__name__", ".*")'
    )

    panels = [
        stat_panel(
            1,
            "Flux Source",
            flux_source_expr,
            {"h": 4, "w": 8, "x": 0, "y": 0},
            text_mode="name",
            targets=[
                {
                    "expr": flux_source_expr,
                    "refId": "A",
                    "legendFormat": "{{branch}} · {{revision}}",
                    "instant": True,
                }
            ],
            thresholds=_steps(*ready_flag),
            description="Branch and revision currently reported by Flux's GitRepository source.",
        ),
        stat_panel(
            2,
            "Kustomizations Ready",
            GITOPS_KUSTOMIZATION_READY_PCT,
            {"h": 4, "w": 4, "x": 8, "y": 0},
            unit="percent",
            decimals=1,
            thresholds=_steps(*ready_percent),
        ),
        stat_panel(
            3,
            "Kustomizations Suspended",
            GITOPS_KUSTOMIZATION_SUSPENDED,
            {"h": 4, "w": 4, "x": 12, "y": 0},
            thresholds=_steps(*none_suspended),
        ),
        stat_panel(
            4,
            "HelmReleases Ready",
            GITOPS_HELM_READY_PCT,
            {"h": 4, "w": 4, "x": 16, "y": 0},
            unit="percent",
            decimals=1,
            thresholds=_steps(*ready_percent),
        ),
        stat_panel(
            5,
            "HelmReleases Suspended",
            GITOPS_HELM_SUSPENDED,
            {"h": 4, "w": 4, "x": 20, "y": 0},
            thresholds=_steps(*none_suspended),
        ),
        stat_panel(
            6,
            "GitOps Exporter",
            None,
            {"h": 4, "w": 8, "x": 0, "y": 4},
            text_mode="name_and_value",
            targets=[
                {"expr": GITOPS_SCRAPE_SUCCESS, "refId": "A", "legendFormat": "Scrape Success", "instant": True},
                {"expr": GITOPS_LAST_SCRAPE_AGE, "refId": "B", "legendFormat": "Sample Age", "instant": True},
            ],
            field_overrides=[
                {
                    "matcher": {"id": "byName", "options": "Sample Age"},
                    "properties": [{"id": "unit", "value": "s"}],
                },
                {
                    "matcher": {"id": "byName", "options": "Scrape Success"},
                    "properties": [{"id": "thresholds", "value": _steps(*ready_flag)}],
                },
            ],
            thresholds=_steps(*ready_flag),
        ),
        state_timeline_panel(
            7,
            "Readiness History",
            readiness_history_expr,
            {"h": 4, "w": 16, "x": 8, "y": 4},
            thresholds=_steps(*ready_percent),
            legend="{{kind}}",
            description="Ready percentage over time for Flux Kustomizations and HelmReleases.",
        ),
        _status_table(
            8,
            "Flux Sources",
            source_table,
            {"h": 8, "w": 24, "x": 0, "y": 8},
            "A Value of 1 means Ready; 0 means not Ready.",
        ),
        _status_table(
            9,
            "Kustomizations",
            kustomization_table,
            {"h": 12, "w": 24, "x": 0, "y": 16},
            "A Value of 1 means Ready; 0 means not Ready. The ready/reason labels come from Flux status.conditions.",
        ),
        _status_table(
            10,
            "HelmReleases",
            helm_table,
            {"h": 12, "w": 24, "x": 0, "y": 28},
            "A Value of 1 means Ready; 0 means not Ready. Chart/version/app_version are included when Flux reports them.",
        ),
    ]

    return {
        "uid": "atlas-gitops",
        "title": "Atlas GitOps",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "gitops", "flux"],
    }
|
|
|
|
|
|
def build_power_dashboard():
    """Assemble the "Atlas Power" dashboard (uid ``atlas-power``).

    Three rows of two panels each: a current-value stat panel on the left and
    its history chart on the right, covering UPS load, tent climate and fan
    activity.

    Returns:
        The dashboard as a plain dict, filed under ``PRIVATE_FOLDER`` with a
        default time range of the last 24 hours.
    """

    def _instant(ref_id, expr, legend):
        # Target for the stat panels: single instant sample.
        return {"refId": ref_id, "expr": expr, "legendFormat": legend, "instant": True}

    def _series(ref_id, expr, legend):
        # Target for the history charts.
        return {"refId": ref_id, "expr": expr, "legendFormat": legend}

    def _named(series_name, properties):
        return {"matcher": {"id": "byName", "options": series_name}, "properties": properties}

    def _unit(series_name, unit):
        return _named(series_name, [{"id": "unit", "value": unit}])

    status_mapping = [
        {
            "type": "value",
            "options": {
                "0": {"text": "⚡ Charging"},
                "1": {"text": "🔋 Discharging"},
            },
        }
    ]

    db_name, tethys_name = ANANKE_UPS_DB_NAME, ANANKE_UPS_TETHYS_NAME
    ups_nodes = ((db_name, ANANKE_UPS_DB_NODE), (tethys_name, ANANKE_UPS_TETHYS_NODE))

    # Overrides are grouped by reading (draw, discharge, status) and, within
    # each reading, listed per UPS. Both status overrides point at the same
    # mapping list.
    ups_overrides = []
    for suffix, lead_property in (
        ("Draw (W)", {"id": "unit", "value": "watt"}),
        ("Discharge", {"id": "unit", "value": "s"}),
        ("Status", {"id": "mappings", "value": status_mapping}),
    ):
        for ups_name, node in ups_nodes:
            ups_overrides.append(
                _named(
                    f"{ups_name} {suffix}",
                    [dict(lead_property), {"id": "description", "value": f"Attached node: {node}"}],
                )
            )

    ups_now = stat_panel(
        1,
        "UPS Current Load",
        None,
        {"h": 8, "w": 12, "x": 0, "y": 0},
        unit="none",
        decimals=1,
        text_mode="name_and_value",
        targets=[
            _instant("A", ANANKE_UPS_DRAW_WATTS_DB, f"{db_name} Draw (W)"),
            _instant("B", ANANKE_UPS_RUNTIME_DB, f"{db_name} Discharge"),
            _instant("C", ANANKE_UPS_ON_BATTERY_DB, f"{db_name} Status"),
            _instant("D", ANANKE_UPS_DRAW_WATTS_TETHYS, f"{tethys_name} Draw (W)"),
            _instant("E", ANANKE_UPS_RUNTIME_TETHYS, f"{tethys_name} Discharge"),
            _instant("F", ANANKE_UPS_ON_BATTERY_TETHYS, f"{tethys_name} Status"),
        ],
        field_overrides=ups_overrides,
        orientation="horizontal",
        wide_layout=True,
        description=(
            "Per-UPS live snapshot: current draw in watts, estimated battery runtime if discharge started now, and charging/discharging status."
        ),
    )

    ups_history = apply_bar_timeseries_style(
        timeseries_panel(
            2,
            "UPS History (Power Draw)",
            None,
            {"h": 8, "w": 12, "x": 12, "y": 0},
            unit="watt",
            targets=[
                _series("A", ANANKE_UPS_DRAW_WATTS_DB_SERIES, db_name),
                _series("B", ANANKE_UPS_DRAW_WATTS_TETHYS_SERIES, tethys_name),
            ],
            field_overrides=fixed_color_overrides(
                {db_name: "dark-blue", tethys_name: "dark-yellow"}
            ),
            legend_display="table",
            legend_placement="right",
            description="Historical UPS power consumption in watts for titan-db and tethys.",
        ),
        stacked=False,
    )

    climate_now = stat_panel(
        3,
        "Current Climate",
        None,
        {"h": 8, "w": 12, "x": 0, "y": 8},
        unit="none",
        decimals=2,
        text_mode="name_and_value",
        targets=[
            _instant("A", CLIMATE_TEMP_MAX, "Tent Temp (°C)"),
            _instant("B", CLIMATE_PRESSURE_CURRENT, "Tent VPD (kPa)"),
            _instant("C", CLIMATE_HUMIDITY_MAX, "Tent RH (%)"),
            _instant("D", CLIMATE_DEWPOINT_CURRENT, "Dew Point (°C)"),
        ],
        field_overrides=[
            _unit("Tent Temp (°C)", "celsius"),
            _unit("Tent VPD (kPa)", "suffix:kPa"),
            _unit("Tent RH (%)", "percent"),
            _unit("Dew Point (°C)", "celsius"),
        ],
        orientation="horizontal",
        wide_layout=True,
        description="Current tent temperature, humidity, VPD, and dew point. These render once Typhon climate telemetry is online.",
    )

    climate_history = timeseries_panel(
        4,
        "Climate History",
        None,
        {"h": 8, "w": 12, "x": 12, "y": 8},
        unit="celsius",
        targets=[
            _series("A", CLIMATE_TEMP_SERIES, "Temperature (°C)"),
            _series("B", CLIMATE_HUMIDITY_SERIES, "Humidity (%)"),
            _series("C", CLIMATE_PRESSURE_SERIES, "VPD (kPa)"),
            _series("D", CLIMATE_DEWPOINT_SERIES, "Dew Point (°C)"),
        ],
        field_overrides=[
            _unit("Humidity (%)", "percent"),
            # VPD is the only series moved to the right-hand axis.
            _named(
                "VPD (kPa)",
                [
                    {"id": "unit", "value": "none"},
                    {"id": "custom.axisPlacement", "value": "right"},
                    {"id": "custom.axisLabel", "value": "kPa"},
                    {"id": "decimals", "value": 2},
                ],
            ),
        ],
        legend_display="table",
        legend_placement="right",
        description="Two-axis chart: tent temperature/humidity/dew point (left axis) and tent VPD in kPa (right axis).",
    )

    fan_groups = ("Inside Outlet", "Inside Inlet", "Outside Inlet", "Interior Fans")
    fan_current = (
        CLIMATE_FAN_OUTLET_CURRENT,
        CLIMATE_FAN_INSIDE_INLET_CURRENT,
        CLIMATE_FAN_OUTSIDE_INLET_CURRENT,
        CLIMATE_FAN_INTERIOR_CURRENT,
    )
    fan_series = (
        CLIMATE_FAN_OUTLET_SERIES,
        CLIMATE_FAN_INSIDE_INLET_SERIES,
        CLIMATE_FAN_OUTSIDE_INLET_SERIES,
        CLIMATE_FAN_INTERIOR_SERIES,
    )

    fans_now = stat_panel(
        5,
        "Fan Activity",
        None,
        {"h": 8, "w": 12, "x": 0, "y": 16},
        unit="none",
        decimals=0,
        text_mode="name_and_value",
        targets=[
            _instant(ref_id, f"round({expr})", group)
            for ref_id, expr, group in zip("ABCD", fan_current, fan_groups)
        ],
        thresholds={
            "mode": "absolute",
            "steps": [
                {"color": "green", "value": None},
                {"color": "yellow", "value": 7},
                {"color": "red", "value": 9},
            ],
        },
        orientation="horizontal",
        wide_layout=True,
        description="Current fan activity levels (0-10): inside outlet, inside inlet, outside inlet, and interior fans.",
    )

    fans_history = timeseries_panel(
        6,
        "Fan Intensity History",
        None,
        {"h": 8, "w": 12, "x": 12, "y": 16},
        unit="none",
        max_value=10,
        targets=[
            _series(ref_id, expr, group)
            for ref_id, expr, group in zip("ABCD", fan_series, fan_groups)
        ],
        legend_display="table",
        legend_placement="right",
        description="Historical fan activity for all four fan groups (0-10 scale).",
    )

    return {
        "uid": "atlas-power",
        "title": "Atlas Power",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": [ups_now, ups_history, climate_now, climate_history, fans_now, fans_history],
        "time": {"from": "now-24h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "power", "climate"],
    }
|
|
|
|
|
|
def build_gpu_dashboard():
    """Assemble the "Atlas GPU" dashboard (uid ``atlas-gpu``).

    Four panels in a 2x2 grid: a namespace share pie, per-namespace and
    per-node utilization charts, and a top-10 table of pods carrying a DCGM
    utilization sample. The dashboard also declares the CPU, GPU and RAM
    namespace-scope template variables.

    Returns:
        The dashboard as a plain dict, filed under ``PRIVATE_FOLDER`` with a
        default time range of the last 12 hours.
    """
    scope_variable = "namespace_scope_gpu"
    gpu_scope = f"${scope_variable}"  # the "$name" reference form of the variable

    panels = [
        pie_panel(
            1,
            "Namespace GPU Utilization",
            namespace_gpu_share_expr(gpu_scope),
            {"h": 8, "w": 12, "x": 0, "y": 0},
            links=namespace_scope_links(scope_variable),
            description="Shares are based on measured DCGM GPU utilization. If multiple namespaces share one physical GPU and DCGM cannot attribute work safely, activity is grouped as shared. Idle appears only when utilization is zero.",
        ),
        timeseries_panel(
            2,
            "GPU Activity by Reservation",
            namespace_gpu_usage_instant(gpu_scope),
            {"h": 8, "w": 12, "x": 12, "y": 0},
            unit="percent",
            legend="{{namespace}}",
            legend_display="table",
            legend_placement="right",
            description="Node/device GPU activity attributed by each namespace's GPU reservation on that node.",
        ),
        timeseries_panel(
            3,
            "GPU Util by Node",
            gpu_util_by_hostname(),
            {"h": 8, "w": 12, "x": 0, "y": 8},
            unit="percent",
            legend="{{Hostname}}",
            legend_display="table",
            legend_placement="right",
        ),
        table_panel(
            4,
            "GPU Pods Reporting Device Util",
            'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
            {"h": 8, "w": 12, "x": 12, "y": 8},
            unit="percent",
            transformations=[{"id": "labelsToFields", "options": {}}],
            description="DCGM labels the device utilization sample with GPU-consuming pods; shared-GPU pods can report the same device value.",
        ),
    ]

    scope_variables = [
        namespace_scope_variable(f"namespace_scope_{resource}", f"{resource.upper()} namespace filter")
        for resource in ("cpu", "gpu", "ram")
    ]

    return {
        "uid": "atlas-gpu",
        "title": "Atlas GPU",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "gpu"],
        "templating": {"list": scope_variables},
    }
|
|
|
|
|
|
# Registry of every generated dashboard, keyed by uid. Each entry names the
# builder that produces the dashboard dict and the ConfigMap file it is
# rendered into; iteration order is the order listed here.
_MONITORING_DIR = ROOT / "services" / "monitoring"

DASHBOARDS = {
    uid: {"builder": builder, "configmap": _MONITORING_DIR / filename}
    for uid, builder, filename in (
        ("atlas-overview", build_overview, "grafana-dashboard-overview.yaml"),
        ("atlas-pods", build_pods_dashboard, "grafana-dashboard-pods.yaml"),
        ("atlas-nodes", build_nodes_dashboard, "grafana-dashboard-nodes.yaml"),
        ("atlas-storage", build_storage_dashboard, "grafana-dashboard-storage.yaml"),
        ("atlas-network", build_network_dashboard, "grafana-dashboard-network.yaml"),
        ("atlas-mail", build_mail_dashboard, "grafana-dashboard-mail.yaml"),
        ("atlas-testing", build_testing_dashboard, "grafana-dashboard-testing.yaml"),
        ("atlas-gitops", build_gitops_dashboard, "grafana-dashboard-gitops.yaml"),
        ("atlas-power", build_power_dashboard, "grafana-dashboard-power.yaml"),
        ("atlas-gpu", build_gpu_dashboard, "grafana-dashboard-gpu.yaml"),
    )
}
|
|
|
|
|
|
def write_json(uid, data):
    """Write one dashboard to ``DASHBOARD_DIR/<uid>.json``.

    The directory is created when missing. *data* is passed through
    ``apply_global_status_palette`` and serialized with a two-space indent
    followed by a trailing newline.
    """
    DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
    normalized = apply_global_status_palette(data)
    serialized = json.dumps(normalized, indent=2)
    (DASHBOARD_DIR / f"{uid}.json").write_text(f"{serialized}\n")
|
|
|
|
|
|
def render_configmap(uid, info):
    """Render ``DASHBOARD_DIR/<uid>.json`` into its ConfigMap file.

    The stored JSON is re-read, passed through ``apply_global_status_palette``,
    re-serialized, indented line by line and substituted into
    ``CONFIG_TEMPLATE``. The result overwrites ``info["configmap"]`` and a
    one-line summary is printed.

    Args:
        uid: Dashboard uid; selects the JSON file to read.
        info: Registry entry; only its ``"configmap"`` path is used here.
    """
    source = DASHBOARD_DIR / f"{uid}.json"
    dashboard = apply_global_status_palette(json.loads(source.read_text()))
    # NOTE(review): the indent prefix is reproduced exactly as it appears in
    # this view; confirm it matches the block-scalar indentation that
    # CONFIG_TEMPLATE expects.
    body_lines = [f" {line}" for line in json.dumps(dashboard, indent=2).splitlines()]

    target = info["configmap"]
    target.write_text(
        CONFIG_TEMPLATE.format(
            relative_path=target.relative_to(ROOT),
            name=target.stem,
            key=source.name,
            payload="\n".join(body_lines),
        )
    )
    print(f"Rendered {source.name} -> {target.relative_to(ROOT)}")
|
|
|
|
|
|
def main():
    """Command-line entry point.

    With ``--build`` every registered builder is run and its JSON written
    first. In all cases every registered dashboard's ConfigMap is then
    rendered from the JSON on disk.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--build",
        action="store_true",
        help="Regenerate dashboard JSON files from builders",
    )

    # Two separate passes: all JSON is rebuilt before any ConfigMap is touched.
    if parser.parse_args().build:
        for uid, entry in DASHBOARDS.items():
            write_json(uid, entry["builder"]())

    for uid, entry in DASHBOARDS.items():
        render_configmap(uid, entry)


if __name__ == "__main__":
    main()
|