#!/usr/bin/env python3
"""Generate Atlas Grafana dashboards and render them into ConfigMaps.

Usage:
  scripts/dashboards_render_atlas.py --build   # rebuild JSON + ConfigMaps
  scripts/dashboards_render_atlas.py           # re-render ConfigMaps from JSON
"""

import argparse
import json
import textwrap
import urllib.parse
from pathlib import Path

# ---------------------------------------------------------------------------
# Paths, folders, and shared metadata
# ---------------------------------------------------------------------------

# This script sits one directory below the repository root.
ROOT = Path(__file__).resolve().parents[1]
DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards"

# ConfigMap manifest skeleton, filled in with str.format().
# `{payload}` sits at column 0, so the caller is expected to pass text that is
# already indented deeper than `{key}` (YAML block scalar).
# NOTE(review): the line breaks and nesting below were reconstructed from a
# whitespace-collapsed copy of this file -- confirm against a rendered
# ConfigMap before relying on the exact layout.
CONFIG_TEMPLATE = textwrap.dedent(
    """# {relative_path}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {name}
  labels:
    grafana_dashboard: "1"
data:
  {key}: |
{payload}
"""
)

# Datasource reference attached to every query panel.
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
PUBLIC_FOLDER = "overview"
PRIVATE_FOLDER = "atlas-internal"

# Shared colour steps for percentage panels (green -> yellow -> orange -> red).
PERCENT_THRESHOLDS = {
    "mode": "absolute",
    "steps": [
        {"color": "green", "value": None},
        {"color": "yellow", "value": 50},
        {"color": "orange", "value": 75},
        {"color": "red", "value": 91.5},
    ],
}

# Range-vector window used for the per-namespace CPU rate.
NAMESPACE_CPU_WINDOW = "1m"

# ---------------------------------------------------------------------------
# Cluster metadata
# ---------------------------------------------------------------------------

CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
CONTROL_DEPENDENCIES = ["titan-db", "titan-jh"]
CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES
WORKER_NODES = [
    "titan-04",
    "titan-05",
    "titan-06",
    "titan-07",
    "titan-08",
    "titan-09",
    "titan-10",
    "titan-11",
    "titan-20",
    "titan-21",
    "titan-12",
    "titan-13",
    "titan-14",
    "titan-15",
    "titan-17",
    "titan-18",
    "titan-19",
    "titan-22",
    "titan-24",
]

# Alternation regexes built from the node lists, for use in label matchers.
CONTROL_REGEX = "|".join(CONTROL_PLANE_NODES)
CONTROL_ALL_REGEX = "|".join(CONTROL_ALL)
WORKER_REGEX = "|".join(WORKER_NODES)

# Node counts and their "/N" display suffixes.
CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
WORKER_TOTAL = len(WORKER_NODES)
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
# Namespaces considered infrastructure (excluded from workload counts)
INFRA_PATTERNS = [
    "kube-.*",
    ".*-system",
    "traefik",
    "monitoring",
    "logging",
    "cert-manager",
    "maintenance",
    "postgres",
]
INFRA_REGEX = f"^({'|'.join(INFRA_PATTERNS)})$"
# Namespaces allowed on control plane without counting as workloads
CP_ALLOWED_NS = INFRA_REGEX
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
# Column widths for the first dashboard row.
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
# Pods on control-plane nodes outside the allowed namespaces; `or on() vector(0)`
# makes the query return 0 instead of an empty result.
CONTROL_WORKLOADS_EXPR = (
    f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
)

# ---------------------------------------------------------------------------
# PromQL helpers
# ---------------------------------------------------------------------------

# node_uname_info with its `nodename` label copied into a `node` label.
NODE_INFO = 'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")'


def node_filter(regex):
    """Return a selector that evaluates to 1 for nodes matching the regex."""
    return (
        f'label_replace(node_uname_info{{nodename=~"{regex}"}}, '
        '"node", "$1", "nodename", "(.*)")'
    )


def scoped_node_expr(base, scope=""):
    """Attach nodename metadata and optionally filter to a scope regex.

    ``base`` is a per-instance expression; it is joined to ``NODE_INFO`` on
    ``instance`` and averaged by ``node``.  A non-empty ``scope`` regex
    additionally multiplies the result by ``node_filter(scope)``.
    """
    expr = f"avg by (node) (({base}) * on(instance) group_left(node) {NODE_INFO})"
    if not scope:
        return expr
    return f"({expr}) * on(node) group_left() {node_filter(scope)}"


def node_cpu_expr(scope=""):
    """Return per-node CPU utilisation in percent (100 minus the idle share)."""
    idle = 'avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))'
    return scoped_node_expr(f"(1 - {idle}) * 100", scope)


def node_mem_expr(scope=""):
    """Return per-node memory utilisation in percent (total minus available)."""
    usage = (
        "avg by (instance) ("
        "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) "
        "/ node_memory_MemTotal_bytes * 100)"
    )
    return scoped_node_expr(usage, scope)


def _mount_selector(mount):
    """Return the label matchers for one mountpoint, excluding tmpfs/overlay."""
    return f'mountpoint="{mount}",fstype!~"tmpfs|overlay"'


def filesystem_usage_expr(mount, scope=""):
    """Return per-node used-space percentage for the filesystem at ``mount``."""
    selector = _mount_selector(mount)
    base = (
        "avg by (instance) ("
        f"(1 - (node_filesystem_avail_bytes{{{selector}}} "
        f"/ node_filesystem_size_bytes{{{selector}}})) * 100)"
    )
    return scoped_node_expr(base, scope)


def root_usage_expr(scope=""):
    """Return per-node used-space percentage for the root filesystem."""
    return filesystem_usage_expr("/", scope)


def astreae_usage_expr(mount):
    """Return used-space percentage for ``mount`` summed over all reporters."""
    selector = _mount_selector(mount)
    return (
        f"100 - (sum(node_filesystem_avail_bytes{{{selector}}}) / "
        f"sum(node_filesystem_size_bytes{{{selector}}}) * 100)"
    )


def astreae_free_expr(mount):
    """Return available bytes for ``mount`` summed over all reporters."""
    return f"sum(node_filesystem_avail_bytes{{{_mount_selector(mount)}}})"


def topk_with_node(expr):
    """Keep the single highest series of ``expr`` and copy ``node`` into ``__name__``."""
    return f'label_replace(topk(1, {expr}), "__name__", "$1", "node", "(.*)")'


def node_net_expr(scope=""):
    """Return per-node network throughput (receive + transmit, loopback excluded)."""
    base = (
        "sum by (instance) ("
        'rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
        '+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))'
    )
    return scoped_node_expr(base, scope)


def node_io_expr(scope=""):
    """Return per-node disk throughput (bytes read + bytes written)."""
    base = (
        "sum by (instance) (rate(node_disk_read_bytes_total[5m]) "
        "+ rate(node_disk_written_bytes_total[5m]))"
    )
    return scoped_node_expr(base, scope)


def namespace_selector(scope_var):
    """Return container-metric matchers followed by the ``scope_var`` matcher."""
    return f'namespace!="",pod!="",container!="",container!="POD",{scope_var}'


def namespace_gpu_selector(scope_var):
    """Return GPU-metric matchers followed by the ``scope_var`` matcher."""
    return f'namespace!="",pod!="",{scope_var}'


def namespace_cpu_raw(scope_var):
    """Return the CPU usage rate summed per namespace."""
    return (
        "sum(rate(container_cpu_usage_seconds_total"
        f"{{{namespace_selector(scope_var)}}}[{NAMESPACE_CPU_WINDOW}])) by (namespace)"
    )


def namespace_ram_raw(scope_var):
    """Return working-set memory summed per namespace."""
    return f"sum(container_memory_working_set_bytes{{{namespace_selector(scope_var)}}}) by (namespace)"


def namespace_gpu_usage_instant(scope_var):
    """Return DCGM GPU utilisation summed per namespace."""
    return f"sum(DCGM_FI_DEV_GPU_UTIL{{{namespace_gpu_selector(scope_var)}}}) by (namespace)"


def namespace_share_expr(resource_expr):
    """Return ``resource_expr`` as a percentage of its own total.

    The denominator is clamped to a minimum of 1 so an all-zero total does
    not divide by zero.
    """
    total = f"clamp_min(sum( {resource_expr} ), 1)"
    return f"100 * ( {resource_expr} ) / {total}"


def namespace_cpu_share_expr(scope_var):
    """Return each namespace's share of CPU usage, in percent."""
    return namespace_share_expr(namespace_cpu_raw(scope_var))


def namespace_ram_share_expr(scope_var):
    """Return each namespace's share of memory usage, in percent."""
    return namespace_share_expr(namespace_ram_raw(scope_var))


def namespace_gpu_share_expr(scope_var):
    """Return each namespace's share of GPU usage, with an idle fallback.

    When the total is exactly 0 the second branch yields a single series
    labelled ``namespace="idle"`` with value 100.
    """
    usage = namespace_gpu_usage_instant(scope_var)
    total = f"(sum({usage}) or on() vector(0))"
    share = f"100 * ({usage}) / clamp_min({total}, 1)"
    idle = 'label_replace(vector(100), "namespace", "idle", "", "") and on() (' + total + " == 0)"
    return f"({share}) or ({idle})"


# Count of pods in any phase other than Running/Succeeded (0 when none).
PROBLEM_PODS_EXPR = (
    'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"})) '
    "or on() vector(0)"
)
# Count of pods with a container waiting on CrashLoopBackOff/ImagePullBackOff.
CRASHLOOP_EXPR = (
    "sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason"
    '{reason=~"CrashLoopBackOff|ImagePullBackOff"})) '
    "or on() vector(0)"
)
# Count of pods whose deletion timestamp is more than 600 seconds old.
STUCK_TERMINATING_EXPR = (
    "sum(max by (namespace,pod) ("
    '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)'
    ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
    ")) "
    "or on() vector(0)"
)

UPTIME_WINDOW = "365d"
# Keep the subquery step coarse so we don't request an excessive number of points.
UPTIME_STEP = "1h"
# Available / desired Traefik replicas (denominator clamped to at least 1).
TRAEFIK_READY_EXPR = (
    "("
    'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
    " / clamp_min("
    'sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)'
    ")"
)
# Fraction of control-plane nodes reporting Ready.
CONTROL_READY_FRACTION_EXPR = (
    f'(sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})'
    f" / {CONTROL_TOTAL})"
)
# NOTE(review): multi-argument min() is not standard PromQL; presumably this
# relies on the datasource's query dialect -- confirm.
UPTIME_AVAIL_EXPR = (
    f"min(({CONTROL_READY_FRACTION_EXPR}), ({TRAEFIK_READY_EXPR}))"
)
# Tie-breaker to deterministically pick one node per namespace when shares tie.
NODE_TIEBREAKER = " + ".join( f"({node_filter(node)}) * 1e-6 * {idx}" for idx, node in enumerate(CONTROL_ALL + WORKER_NODES, start=1) ) UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:{UPTIME_STEP}])" UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))" UPTIME_THRESHOLDS = { "mode": "absolute", "steps": [ {"color": "red", "value": None}, {"color": "orange", "value": 2}, {"color": "yellow", "value": 3}, {"color": "green", "value": 3.5}, ], } UPTIME_PERCENT_THRESHOLDS = { "mode": "absolute", "steps": [ {"color": "red", "value": None}, {"color": "orange", "value": 0.99}, {"color": "yellow", "value": 0.999}, {"color": "green", "value": 0.9999}, {"color": "blue", "value": 0.99999}, ], } PROBLEM_TABLE_EXPR = ( "(time() - kube_pod_created{pod!=\"\"}) " "* on(namespace,pod) group_left(node) kube_pod_info " "* on(namespace,pod) group_left(phase) " "max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})" ) CRASHLOOP_TABLE_EXPR = ( "(time() - kube_pod_created{pod!=\"\"}) " "* on(namespace,pod) group_left(node) kube_pod_info " "* on(namespace,pod,container) group_left(reason) " "max by (namespace,pod,container,reason) " "(kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})" ) STUCK_TABLE_EXPR = ( "(" "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) " "and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) " "* on(namespace,pod) group_left(node) kube_pod_info" ")" ) NAMESPACE_SCOPE_WORKLOAD = f'namespace!~"{INFRA_REGEX}"' NAMESPACE_SCOPE_ALL = 'namespace=~".*"' NAMESPACE_SCOPE_INFRA = f'namespace=~"{INFRA_REGEX}"' NAMESPACE_SCOPE_VARS = ["namespace_scope_cpu", "namespace_scope_gpu", "namespace_scope_ram"] GLUE_LABEL = 'label_atlas_bstein_dev_glue="true"' GLUE_JOBS = f"kube_cronjob_labels{{{GLUE_LABEL}}}" GLUE_FILTER = f"and on(namespace,cronjob) {GLUE_JOBS}" GLUE_LAST_SUCCESS = 
f"(kube_cronjob_status_last_successful_time {GLUE_FILTER})" GLUE_LAST_SCHEDULE = f"(kube_cronjob_status_last_schedule_time {GLUE_FILTER})" GLUE_SUSPENDED = f"(kube_cronjob_spec_suspend {GLUE_FILTER}) == 1" GLUE_ACTIVE = f"(kube_cronjob_status_active {GLUE_FILTER})" GLUE_LAST_SUCCESS_AGE = f"(time() - {GLUE_LAST_SUCCESS})" GLUE_LAST_SCHEDULE_AGE = f"(time() - {GLUE_LAST_SCHEDULE})" GLUE_LAST_SUCCESS_AGE_HOURS = f"({GLUE_LAST_SUCCESS_AGE}) / 3600" GLUE_LAST_SCHEDULE_AGE_HOURS = f"({GLUE_LAST_SCHEDULE_AGE}) / 3600" GLUE_STALE_WINDOW_SEC = 36 * 3600 GLUE_STALE = f"({GLUE_LAST_SUCCESS_AGE} > bool {GLUE_STALE_WINDOW_SEC})" GLUE_MISSING = f"({GLUE_JOBS} unless on(namespace,cronjob) kube_cronjob_status_last_successful_time)" GLUE_STALE_ACTIVE = f"({GLUE_STALE} unless on(namespace,cronjob) {GLUE_SUSPENDED})" GLUE_MISSING_ACTIVE = f"({GLUE_MISSING} unless on(namespace,cronjob) {GLUE_SUSPENDED})" GLUE_STALE_COUNT = f"(sum({GLUE_STALE_ACTIVE}) + count({GLUE_MISSING_ACTIVE}))" GLUE_MISSING_COUNT = f"count({GLUE_MISSING_ACTIVE})" GLUE_SUSPENDED_COUNT = f"sum({GLUE_SUSPENDED})" ARIADNE_TASK_ERRORS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="error"}[24h]))' ARIADNE_TASK_SUCCESS_24H = 'sum by (task) (increase(ariadne_task_runs_total{status="ok"}[24h]))' ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS = "(time() - ariadne_schedule_last_success_timestamp_seconds) / 3600" ARIADNE_ACCESS_REQUESTS = "ariadne_access_requests_total" GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"] GPU_NODE_REGEX = "|".join(GPU_NODES) TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))" TRAEFIK_NET_INGRESS = ( 'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))' " or on() vector(0)" ) TRAEFIK_NET_EGRESS = ( 'sum(rate(container_network_transmit_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))' " or on() vector(0)" ) NET_CLUSTER_RX = ( 
'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))' " or on() vector(0)" ) NET_CLUSTER_TX = ( 'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))' " or on() vector(0)" ) PHYSICAL_NET_FILTER = 'device!~"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*"' NET_NODE_RX_PHYS = ( f'sum(rate(node_network_receive_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)' ) NET_NODE_TX_PHYS = ( f'sum(rate(node_network_transmit_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)' ) NET_TOTAL_EXPR = NET_NODE_TX_PHYS NET_INGRESS_EXPR = NET_NODE_RX_PHYS NET_EGRESS_EXPR = NET_NODE_TX_PHYS NET_INTERNAL_EXPR = ( 'sum(rate(container_network_receive_bytes_total{namespace!="traefik",pod!=""}[5m]) ' '+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))' ' or on() vector(0)' ) APISERVER_5XX_RATE = 'sum(rate(apiserver_request_total{code=~"5.."}[5m]))' APISERVER_P99_LATENCY_MS = ( "histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000" ) ETCD_P99_LATENCY_MS = ( "histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000" ) TRAEFIK_TOTAL_5M = "sum(rate(traefik_entrypoint_requests_total[5m]))" TRAEFIK_SUCCESS_5M = 'sum(rate(traefik_entrypoint_requests_total{code!~"5.."}[5m]))' TRAEFIK_SLI_5M = f"({TRAEFIK_SUCCESS_5M}) / clamp_min({TRAEFIK_TOTAL_5M}, 1)" TRAEFIK_P99_LATENCY_MS = ( "histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" ) TRAEFIK_P95_LATENCY_MS = ( "histogram_quantile(0.95, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000" ) SLO_AVAILABILITY = 0.999 def traefik_sli(window): total = f'sum(rate(traefik_entrypoint_requests_total[{window}]))' success = f'sum(rate(traefik_entrypoint_requests_total{{code!~"5.."}}[{window}]))' return f"({success}) / clamp_min({total}, 1)" def 
traefik_burn(window): sli = traefik_sli(window) return f"(1 - ({sli})) / {1 - SLO_AVAILABILITY}" # --------------------------------------------------------------------------- # Panel factories # --------------------------------------------------------------------------- def stat_panel( panel_id, title, expr, grid, *, unit="none", decimals=None, thresholds=None, text_mode="value", legend=None, instant=False, value_suffix=None, links=None, ): """Return a Grafana stat panel definition.""" defaults = { "color": {"mode": "thresholds"}, "mappings": [], "thresholds": thresholds or { "mode": "absolute", "steps": [ {"color": "rgba(115, 115, 115, 1)", "value": None}, {"color": "green", "value": 1}, ], }, "unit": unit, "custom": {"displayMode": "auto"}, } if value_suffix: defaults["custom"]["valueSuffix"] = value_suffix if decimals is not None: defaults["decimals"] = decimals panel = { "id": panel_id, "type": "stat", "title": title, "datasource": PROM_DS, "gridPos": grid, "targets": [{"expr": expr, "refId": "A"}], "fieldConfig": {"defaults": defaults, "overrides": []}, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, "textMode": text_mode, }, } if legend: panel["targets"][0]["legendFormat"] = legend if instant: panel["targets"][0]["instant"] = True if links: panel["links"] = links return panel def gauge_panel( panel_id, title, expr, grid, *, min_value=0, max_value=1, thresholds=None, links=None, ): return { "id": panel_id, "type": "gauge", "title": title, "datasource": PROM_DS, "gridPos": grid, "targets": [{"expr": expr, "refId": "A"}], "fieldConfig": { "defaults": { "min": min_value, "max": max_value, "thresholds": thresholds or { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "red", "value": max_value}, ], }, }, "overrides": [], }, "options": { "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, "orientation": 
"auto", "showThresholdMarkers": False, "showThresholdLabels": False, }, **({"links": links} if links else {}), } def timeseries_panel( panel_id, title, expr, grid, *, unit="none", legend=None, legend_display="table", legend_placement="bottom", legend_calcs=None, time_from=None, links=None, ): """Return a Grafana time-series panel definition.""" panel = { "id": panel_id, "type": "timeseries", "title": title, "datasource": PROM_DS, "gridPos": grid, "targets": [{"expr": expr, "refId": "A"}], "fieldConfig": {"defaults": {"unit": unit}, "overrides": []}, "options": { "legend": { "displayMode": legend_display, "placement": legend_placement, }, "tooltip": {"mode": "multi"}, }, } if legend: panel["targets"][0]["legendFormat"] = legend if legend_calcs: panel["options"]["legend"]["calcs"] = legend_calcs if time_from: panel["timeFrom"] = time_from if links: panel["links"] = links return panel def table_panel( panel_id, title, expr, grid, *, unit="none", transformations=None, instant=False, options=None, filterable=True, footer=None, format=None, ): """Return a Grafana table panel definition.""" # Optional PromQL subquery helpers in expr: share(), etc. 
panel_options = {"showHeader": True, "columnFilters": False} if options: panel_options.update(options) if footer is not None: panel_options["footer"] = footer field_defaults = {"unit": unit, "custom": {"filterable": filterable}} target = {"expr": expr, "refId": "A", **({"instant": True} if instant else {})} if format: target["format"] = format panel = { "id": panel_id, "type": "table", "title": title, "datasource": PROM_DS, "gridPos": grid, "targets": [target], "fieldConfig": {"defaults": field_defaults, "overrides": []}, "options": panel_options, } if transformations: panel["transformations"] = transformations return panel def pie_panel(panel_id, title, expr, grid, *, links=None, description=None): """Return a pie chart panel with readable namespace labels.""" panel = { "id": panel_id, "type": "piechart", "title": title, "datasource": PROM_DS, "gridPos": grid, "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}], "fieldConfig": { "defaults": { "unit": "percent", "color": {"mode": "palette-classic"}, }, "overrides": [], }, "options": { "legend": {"displayMode": "list", "placement": "right"}, "pieType": "pie", "displayLabels": [], "tooltip": {"mode": "single"}, "colorScheme": "interpolateSpectral", "colorBy": "value", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, }, } if links: panel["links"] = links if description: panel["description"] = description return panel def namespace_scope_variable(var_name, label): options = [ { "text": "workload namespaces only", "value": NAMESPACE_SCOPE_WORKLOAD, "selected": True, }, {"text": "all namespaces", "value": NAMESPACE_SCOPE_ALL, "selected": False}, { "text": "infrastructure namespaces only", "value": NAMESPACE_SCOPE_INFRA, "selected": False, }, ] query = ( "workload namespaces only : " + NAMESPACE_SCOPE_WORKLOAD + ",all namespaces : " + NAMESPACE_SCOPE_ALL + ",infrastructure namespaces only : " + NAMESPACE_SCOPE_INFRA ) return { "name": var_name, "label": label, "type": 
"custom", "query": query, "current": {"text": options[0]["text"], "value": options[0]["value"], "selected": True}, "options": options, "hide": 2, "multi": False, "includeAll": False, "refresh": 1, "sort": 0, "skipUrlSync": False, } def namespace_scope_links(var_name): def with_value(value): encoded = urllib.parse.quote(value, safe="") params = [] for other in NAMESPACE_SCOPE_VARS: if other == var_name: params.append(f"var-{other}={encoded}") else: params.append(f"var-{other}=${{{other}}}") return "?" + "&".join(params) return [ {"title": "Workload namespaces only", "url": with_value(NAMESPACE_SCOPE_WORKLOAD), "targetBlank": False}, {"title": "All namespaces", "url": with_value(NAMESPACE_SCOPE_ALL), "targetBlank": False}, { "title": "Infrastructure namespaces only", "url": with_value(NAMESPACE_SCOPE_INFRA), "targetBlank": False, }, ] def bargauge_panel( panel_id, title, expr, grid, *, unit="none", links=None, limit=None, thresholds=None, decimals=None, instant=False, ): """Return a bar gauge panel with label-aware reduction.""" panel = { "id": panel_id, "type": "bargauge", "title": title, "datasource": PROM_DS, "gridPos": grid, "targets": [ {"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})} ], "fieldConfig": { "defaults": { "unit": unit, "min": 0, "max": 100 if unit == "percent" else None, "thresholds": thresholds or { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 50}, {"color": "orange", "value": 70}, {"color": "red", "value": 85}, ], }, }, "overrides": [], }, "options": { "displayMode": "gradient", "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": False, }, }, } if decimals is not None: panel["fieldConfig"]["defaults"]["decimals"] = decimals if links: panel["links"] = links # Keep bars ordered by value descending for readability. 
panel["transformations"] = [ { "id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}, } ] if limit: panel["transformations"].append({"id": "limit", "options": {"limit": limit}}) return panel def text_panel(panel_id, title, content, grid): return { "id": panel_id, "type": "text", "title": title, "gridPos": grid, "datasource": None, "options": {"mode": "markdown", "content": content}, } def link_to(uid): return [{"title": f"Open {uid} dashboard", "url": f"/d/{uid}", "targetBlank": True}] # --------------------------------------------------------------------------- # Dashboard builders # --------------------------------------------------------------------------- def build_overview(): panels = [] count_thresholds = { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 1}, {"color": "orange", "value": 2}, {"color": "red", "value": 3}, ], } row1_stats = [ { "id": 2, "title": "Control Plane Ready", "expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})', "kind": "gauge", "max_value": CONTROL_TOTAL, "thresholds": { "mode": "absolute", "steps": [ {"color": "red", "value": None}, {"color": "green", "value": CONTROL_TOTAL}, ], }, }, { "id": 3, "title": "Control Plane Workloads", "expr": CONTROL_WORKLOADS_EXPR, "kind": "stat", "thresholds": { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 1}, {"color": "orange", "value": 2}, {"color": "red", "value": 3}, ], }, "links": link_to("atlas-pods"), }, { "id": 5, "title": "Stuck Terminating", "expr": STUCK_TERMINATING_EXPR, "kind": "stat", "thresholds": { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 1}, {"color": "orange", "value": 2}, {"color": "red", "value": 3}, ], }, "links": link_to("atlas-pods"), }, { "id": 27, "title": "Atlas Availability", "expr": UPTIME_PERCENT_EXPR, "kind": "stat", "thresholds": UPTIME_PERCENT_THRESHOLDS, "unit": 
"percentunit", "decimals": 4, "text_mode": "value", }, { "id": 4, "title": "Problem Pods", "expr": PROBLEM_PODS_EXPR, "kind": "stat", "thresholds": { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 1}, {"color": "orange", "value": 2}, {"color": "red", "value": 3}, ], }, "links": link_to("atlas-pods"), }, { "id": 6, "title": "CrashLoop / ImagePull", "expr": CRASHLOOP_EXPR, "kind": "stat", "thresholds": { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 1}, {"color": "orange", "value": 2}, {"color": "red", "value": 3}, ], }, "links": link_to("atlas-pods"), }, { "id": 1, "title": "Workers Ready", "expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})', "kind": "gauge", "max_value": WORKER_TOTAL, "thresholds": { "mode": "absolute", "steps": [ {"color": "red", "value": None}, {"color": "orange", "value": WORKER_TOTAL - 2}, {"color": "yellow", "value": WORKER_TOTAL - 1}, {"color": "green", "value": WORKER_TOTAL}, ], }, }, ] def gauge_grid(idx): width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4 x = sum(GAUGE_WIDTHS[:idx]) return width, x for idx, item in enumerate(row1_stats): panel_id = item["id"] width, x = gauge_grid(idx) grid = {"h": 5, "w": width, "x": x, "y": 0} kind = item.get("kind", "gauge") if kind == "stat": panels.append( stat_panel( panel_id, item["title"], item["expr"], grid, thresholds=item.get("thresholds"), legend=None, links=item.get("links"), text_mode=item.get("text_mode", "value"), value_suffix=item.get("value_suffix"), unit=item.get("unit", "none"), decimals=item.get("decimals"), ) ) else: panels.append( gauge_panel( panel_id, item["title"], item["expr"], grid, min_value=0, max_value=item.get("max_value", 5), thresholds=item.get("thresholds"), links=item.get("links"), ) ) hottest = [ (7, "Hottest node: CPU", topk_with_node(node_cpu_expr()), "percent"), (8, "Hottest node: RAM", 
topk_with_node(node_mem_expr()), "percent"), (9, "Hottest node: NET (rx+tx)", topk_with_node(node_net_expr()), "Bps"), (10, "Hottest node: I/O (r+w)", topk_with_node(node_io_expr()), "Bps"), ] for idx, (panel_id, title, expr, unit) in enumerate(hottest): panels.append( stat_panel( panel_id, title, f"{expr}", {"h": 3, "w": 6, "x": 6 * idx, "y": 5}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, text_mode="name_and_value", legend="{{node}}", instant=True, links=link_to("atlas-nodes"), ) ) mail_bounce_rate_thresholds = { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 5}, {"color": "orange", "value": 8}, {"color": "red", "value": 10}, ], } mail_limit_thresholds = { "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 70}, {"color": "orange", "value": 85}, {"color": "red", "value": 95}, ], } mail_success_thresholds = { "mode": "absolute", "steps": [ {"color": "red", "value": None}, {"color": "orange", "value": 90}, {"color": "yellow", "value": 95}, {"color": "green", "value": 98}, ], } panels.append( stat_panel( 30, "Mail Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', {"h": 2, "w": 5, "x": 0, "y": 8}, unit="none", links=link_to("atlas-mail"), ) ) panels.append( { "id": 31, "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, "gridPos": {"h": 2, "w": 5, "x": 10, "y": 8}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', "refId": "A", "legendFormat": "Rate", }, { "expr": 'max(postmark_outbound_bounced{window="1d"})', "refId": "B", "legendFormat": "Count", }, ], "fieldConfig": { "defaults": { "color": {"mode": "thresholds"}, "custom": {"displayMode": "auto"}, "thresholds": mail_bounce_rate_thresholds, "unit": "none", }, "overrides": [ { "matcher": {"id": "byName", "options": "Rate"}, "properties": [{"id": "unit", "value": "percent"}], }, { "matcher": {"id": "byName", "options": "Count"}, "properties": [{"id": 
"unit", "value": "none"}], }, ], }, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, "textMode": "name_and_value", }, "links": link_to("atlas-mail"), } ) panels.append( stat_panel( 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', {"h": 2, "w": 5, "x": 5, "y": 8}, unit="percent", thresholds=mail_success_thresholds, decimals=1, links=link_to("atlas-mail"), ) ) panels.append( stat_panel( 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", {"h": 2, "w": 5, "x": 15, "y": 8}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, links=link_to("atlas-mail"), ) ) storage_panels = [ (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"), (25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"), (26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"), ] for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): panels.append( stat_panel( panel_id, title, expr, {"h": 6, "w": 6, "x": 6 * idx, "y": 10}, unit=unit, thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, links=link_to("atlas-storage"), ) ) cpu_scope = "$namespace_scope_cpu" gpu_scope = "$namespace_scope_gpu" ram_scope = "$namespace_scope_ram" panels.append( pie_panel( 11, "Namespace CPU Share", namespace_cpu_share_expr(cpu_scope), {"h": 9, "w": 8, "x": 0, "y": 16}, links=namespace_scope_links("namespace_scope_cpu"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) ) panels.append( pie_panel( 12, "Namespace GPU Share", namespace_gpu_share_expr(gpu_scope), {"h": 9, "w": 8, "x": 8, "y": 16}, links=namespace_scope_links("namespace_scope_gpu"), description="Shares are normalized within the selected filter. 
Switching scope changes the denominator.", ) ) panels.append( pie_panel( 13, "Namespace RAM Share", namespace_ram_share_expr(ram_scope), {"h": 9, "w": 8, "x": 16, "y": 16}, links=namespace_scope_links("namespace_scope_ram"), description="Shares are normalized within the selected filter. Switching scope changes the denominator.", ) ) worker_filter = f"{WORKER_REGEX}" panels.append( timeseries_panel( 14, "Worker Node CPU", node_cpu_expr(worker_filter), {"h": 12, "w": 12, "x": 0, "y": 32}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", links=link_to("atlas-nodes"), ) ) panels.append( timeseries_panel( 15, "Worker Node RAM", node_mem_expr(worker_filter), {"h": 12, "w": 12, "x": 12, "y": 32}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right", links=link_to("atlas-nodes"), ) ) panels.append( timeseries_panel( 16, "Control plane CPU", node_cpu_expr(CONTROL_ALL_REGEX), {"h": 10, "w": 12, "x": 0, "y": 44}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", ) ) panels.append( timeseries_panel( 17, "Control plane RAM", node_mem_expr(CONTROL_ALL_REGEX), {"h": 10, "w": 12, "x": 12, "y": 44}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", ) ) panels.append( pie_panel( 28, "Node Pod Share", '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100', {"h": 10, "w": 12, "x": 0, "y": 54}, ) ) panels.append( bargauge_panel( 29, "Top Nodes by Pod Count", 'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))', {"h": 10, "w": 12, "x": 12, "y": 54}, unit="none", limit=12, decimals=0, thresholds={ "mode": "absolute", "steps": [ {"color": "green", "value": None}, {"color": "yellow", "value": 50}, {"color": "orange", "value": 75}, {"color": "red", "value": 100}, ], }, instant=True, ) ) panels.append( timeseries_panel( 18, "Cluster Ingress 
def build_pods_dashboard():
    """Build the "Atlas Pods" dashboard (uid ``atlas-pods``).

    Panels, in order: four stat tiles, three full-width tables, a pie and a
    bar gauge of pods per node, and a table that keeps, for every namespace,
    the node carrying the largest share of that namespace's pods.

    Returns:
        dict: dashboard definition filed under ``PRIVATE_FOLDER``.
    """

    def red_from_one():
        # New dict on every call so each panel receives its own object.
        return {
            "mode": "absolute",
            "steps": [
                {"color": "green", "value": None},
                {"color": "red", "value": 1},
            ],
        }

    def labels_to_fields():
        return {"id": "labelsToFields", "options": {}}

    panels = [
        stat_panel(
            1,
            "Problem Pods",
            PROBLEM_PODS_EXPR,
            {"h": 4, "w": 6, "x": 0, "y": 0},
            thresholds=red_from_one(),
        ),
        stat_panel(
            2,
            "CrashLoop / ImagePull",
            CRASHLOOP_EXPR,
            {"h": 4, "w": 6, "x": 6, "y": 0},
            thresholds=red_from_one(),
        ),
        stat_panel(
            3,
            "Stuck Terminating (>10m)",
            STUCK_TERMINATING_EXPR,
            {"h": 4, "w": 6, "x": 12, "y": 0},
            thresholds=red_from_one(),
        ),
        stat_panel(
            4,
            "Control Plane Workloads",
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
            {"h": 4, "w": 6, "x": 18, "y": 0},
            thresholds=red_from_one(),
        ),
        table_panel(
            5,
            "Pods Not Running",
            PROBLEM_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 4},
            unit="s",
            transformations=[labels_to_fields()],
        ),
        table_panel(
            6,
            "CrashLoop / ImagePull",
            CRASHLOOP_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 14},
            unit="s",
            transformations=[labels_to_fields()],
        ),
        table_panel(
            7,
            "Terminating >10m",
            STUCK_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 24},
            unit="s",
            transformations=[
                labels_to_fields(),
                {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
            ],
        ),
        pie_panel(
            8,
            "Node Pod Share",
            '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
            {"h": 8, "w": 12, "x": 12, "y": 34},
        ),
        bargauge_panel(
            9,
            "Top Nodes by Pod Count",
            'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
            {"h": 8, "w": 12, "x": 0, "y": 34},
            unit="none",
            limit=12,
            decimals=0,
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 50},
                    {"color": "orange", "value": 75},
                    {"color": "red", "value": 100},
                ],
            },
            instant=True,
        ),
    ]

    # Percentage of each namespace's pods that sit on each node.
    pod_share = (
        '(sum by (namespace,node) (kube_pod_info{pod!="" , node!=""}) '
        '/ on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=""}), 1) * 100)'
    )

    # One term per known node, each contributing a distinct small offset
    # (position * 1e-3); presumably this breaks ties between nodes that hold
    # an equal share of a namespace.
    tie_breakers = []
    for position, node_name in enumerate(CONTROL_ALL + WORKER_NODES, start=1):
        tie_breakers.append(
            f'(sum by (node) (kube_node_info{{node="{node_name}"}}) * 0 + {position * 1e-3})'
        )
    offsets = " or ".join(tie_breakers)
    scored = f"{pod_share} + on(node) group_left() ({offsets})"

    # 1 for the row whose score equals its namespace maximum, 0 otherwise.
    winner_mask = (
        f"{scored} == bool on(namespace) group_left() "
        f"(max by (namespace) ({scored}))"
    )
    plurality = f"{pod_share} * on(namespace,node) group_left() ({winner_mask})"

    panels.append(
        table_panel(
            10,
            "Namespace Plurality by Node v27",
            plurality,
            {"h": 8, "w": 24, "x": 0, "y": 42},
            unit="percent",
            transformations=[
                labels_to_fields(),
                {"id": "organize", "options": {"excludeByName": {"Time": True}}},
                {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 0}},
                {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}},
                {
                    "id": "groupBy",
                    "options": {
                        "fields": {
                            "namespace": {
                                "aggregations": [
                                    {"field": "Value", "operation": "max"},
                                    {"field": "node", "operation": "first"},
                                ]
                            }
                        },
                        "rowBy": ["namespace"],
                    },
                },
            ],
            instant=True,
            options={"showColumnFilters": False},
            filterable=False,
            footer={"show": False, "fields": "", "calcs": []},
            format="table",
        )
    )

    dashboard = {
        "uid": "atlas-pods",
        "title": "Atlas Pods",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "pods"],
    }
    return dashboard
def build_nodes_dashboard():
    """Build the "Atlas Nodes" dashboard (uid ``atlas-nodes``).

    Panels, in order: readiness and control-plane workload tiles, API server
    and etcd tiles, then CPU, RAM and root-filesystem time series.

    Returns:
        dict: dashboard definition filed under ``PRIVATE_FOLDER``.
    """

    def traffic_light(yellow, orange, red):
        # Green base step followed by three escalating steps.
        return {
            "mode": "absolute",
            "steps": [
                {"color": "green", "value": None},
                {"color": "yellow", "value": yellow},
                {"color": "orange", "value": orange},
                {"color": "red", "value": red},
            ],
        }

    panels = []

    # Ready-node counts; the suffix shows the expected total next to the value.
    readiness_tiles = (
        ("Worker Nodes Ready", WORKER_REGEX, WORKER_SUFFIX),
        ("Control Plane Ready", CONTROL_REGEX, CONTROL_SUFFIX),
    )
    for slot, (title, node_regex, suffix) in enumerate(readiness_tiles):
        panels.append(
            stat_panel(
                slot + 1,
                title,
                f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{node_regex}"}})',
                {"h": 4, "w": 8, "x": 8 * slot, "y": 0},
                value_suffix=suffix,
            )
        )

    panels += [
        stat_panel(
            3,
            "Control Plane Workloads",
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
            {"h": 4, "w": 8, "x": 16, "y": 0},
        ),
        stat_panel(
            9,
            "API Server 5xx rate",
            APISERVER_5XX_RATE,
            {"h": 4, "w": 8, "x": 0, "y": 4},
            unit="req/s",
            thresholds=traffic_light(0.05, 0.2, 0.5),
            decimals=3,
        ),
        stat_panel(
            10,
            "API Server P99 latency",
            APISERVER_P99_LATENCY_MS,
            {"h": 4, "w": 8, "x": 8, "y": 4},
            unit="ms",
            thresholds=traffic_light(250, 400, 600),
            decimals=1,
        ),
        stat_panel(
            11,
            "etcd P99 latency",
            ETCD_P99_LATENCY_MS,
            {"h": 4, "w": 8, "x": 16, "y": 4},
            unit="ms",
            thresholds=traffic_light(50, 100, 200),
            decimals=1,
        ),
        timeseries_panel(
            4,
            "Node CPU",
            node_cpu_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 8},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        ),
        timeseries_panel(
            5,
            "Node RAM",
            node_mem_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 17},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        ),
        timeseries_panel(
            6,
            "Control Plane (incl. titan-db) CPU",
            node_cpu_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 0, "y": 26},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        ),
        timeseries_panel(
            7,
            "Control Plane (incl. titan-db) RAM",
            node_mem_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 12, "y": 26},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        ),
        timeseries_panel(
            8,
            "Root Filesystem Usage",
            root_usage_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 35},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        ),
    ]

    dashboard = {
        "uid": "atlas-nodes",
        "title": "Atlas Nodes",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "nodes"],
    }
    return dashboard
def build_storage_dashboard():
    """Build the "Atlas Storage" dashboard (uid ``atlas-storage``).

    The mounts ``/mnt/astreae`` and ``/mnt/asteria`` each get the same four
    panels (usage, free space, per-node usage, usage history). Two tiles for
    the ``maintenance`` namespace follow.

    Returns:
        dict: dashboard definition filed under ``PRIVATE_FOLDER``.
    """
    volumes = (("Astreae", "/mnt/astreae"), ("Asteria", "/mnt/asteria"))
    panels = []

    # Each loop emits one panel per volume, left to right, so panel ids and
    # helper calls happen in the order 1..8.
    for column, (label, mount) in enumerate(volumes):
        panels.append(
            stat_panel(
                1 + column,
                f"{label} Usage",
                astreae_usage_expr(mount),
                {"h": 5, "w": 6, "x": 6 * column, "y": 0},
                unit="percent",
                thresholds=PERCENT_THRESHOLDS,
            )
        )

    for column, (label, mount) in enumerate(volumes):
        panels.append(
            stat_panel(
                3 + column,
                f"{label} Free",
                astreae_free_expr(mount),
                {"h": 5, "w": 6, "x": 12 + 6 * column, "y": 0},
                unit="decbytes",
            )
        )

    for column, (label, mount) in enumerate(volumes):
        panels.append(
            timeseries_panel(
                5 + column,
                f"{label} Per-Node Usage",
                filesystem_usage_expr(mount, LONGHORN_NODE_REGEX),
                {"h": 9, "w": 12, "x": 12 * column, "y": 5},
                unit="percent",
                legend="{{node}}",
                legend_display="table",
                legend_placement="right",
                time_from="30d",
            )
        )

    for column, (label, mount) in enumerate(volumes):
        panels.append(
            timeseries_panel(
                7 + column,
                f"{label} Usage History",
                astreae_usage_expr(mount),
                {"h": 9, "w": 12, "x": 12 * column, "y": 14},
                unit="percent",
                time_from="90d",
            )
        )

    # Share of desired node-image-sweeper pods that are ready, in percent.
    sweepers_ready = stat_panel(
        30,
        "Maintenance Sweepers Ready",
        'kube_daemonset_status_number_ready{namespace="maintenance",daemonset="node-image-sweeper"} / on(namespace,daemonset) kube_daemonset_status_desired_number_scheduled{namespace="maintenance",daemonset="node-image-sweeper"} * 100',
        {"h": 4, "w": 12, "x": 0, "y": 44},
        unit="percent",
        thresholds=PERCENT_THRESHOLDS,
    )
    panels.append(sweepers_ready)

    # Seconds since the image-sweeper cronjob last succeeded.
    cron_freshness = stat_panel(
        31,
        "Maintenance Cron Freshness (s)",
        'time() - max by (cronjob) (kube_cronjob_status_last_successful_time{namespace="maintenance",cronjob="image-sweeper"})',
        {"h": 4, "w": 12, "x": 12, "y": 44},
        unit="s",
        thresholds={
            "mode": "absolute",
            "steps": [
                {"color": "green", "value": None},
                {"color": "yellow", "value": 3600},
                {"color": "red", "value": 10800},
            ],
        },
    )
    panels.append(cron_freshness)

    dashboard = {
        "uid": "atlas-storage",
        "title": "Atlas Storage",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "storage"],
    }
    return dashboard
def build_network_dashboard():
    """Build the "Atlas Network" dashboard (uid ``atlas-network``).

    Panels, in order: Traefik success-rate, error-budget-burn and latency
    tiles, three traffic tiles, per-node throughput, top-namespace and
    top-pod tables, and Traefik router and entrypoint request rates.

    Returns:
        dict: dashboard definition filed under ``PRIVATE_FOLDER``.
    """

    def stepped(base_color, *tiers):
        # base_color is the unconditional first step; tiers are
        # (color, value) pairs appended in the order given.
        steps = [{"color": base_color, "value": None}]
        for color, value in tiers:
            steps.append({"color": color, "value": value})
        return {"mode": "absolute", "steps": steps}

    def burn_thresholds():
        return stepped("green", ("yellow", 1), ("orange", 2), ("red", 4))

    def labels_to_fields():
        return {"id": "labelsToFields", "options": {}}

    panels = [
        stat_panel(
            1,
            "Ingress Success Rate (5m)",
            TRAEFIK_SLI_5M,
            {"h": 4, "w": 6, "x": 0, "y": 0},
            unit="percentunit",
            decimals=2,
            # Inverted scale: the base step is red and higher values are better.
            thresholds=stepped("red", ("orange", 0.995), ("yellow", 0.999), ("green", 0.9995)),
        ),
        stat_panel(
            2,
            "Error Budget Burn (1h)",
            traefik_burn("1h"),
            {"h": 4, "w": 6, "x": 6, "y": 0},
            thresholds=burn_thresholds(),
            decimals=2,
        ),
        stat_panel(
            3,
            "Error Budget Burn (6h)",
            traefik_burn("6h"),
            {"h": 4, "w": 6, "x": 12, "y": 0},
            thresholds=burn_thresholds(),
            decimals=2,
        ),
        stat_panel(
            4,
            "Edge P99 Latency (ms)",
            TRAEFIK_P99_LATENCY_MS,
            {"h": 4, "w": 6, "x": 18, "y": 0},
            unit="ms",
            thresholds=stepped("green", ("yellow", 200), ("orange", 350), ("red", 500)),
            decimals=1,
        ),
        stat_panel(
            5,
            "Ingress Traffic",
            NET_INGRESS_EXPR,
            {"h": 4, "w": 8, "x": 0, "y": 4},
            unit="Bps",
        ),
        stat_panel(
            6,
            "Egress Traffic",
            NET_EGRESS_EXPR,
            {"h": 4, "w": 8, "x": 8, "y": 4},
            unit="Bps",
        ),
        stat_panel(
            7,
            "Intra-Cluster Traffic",
            NET_INTERNAL_EXPR,
            {"h": 4, "w": 8, "x": 16, "y": 4},
            unit="Bps",
        ),
        timeseries_panel(
            8,
            "Per-Node Throughput",
            f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
            {"h": 8, "w": 24, "x": 0, "y": 8},
            unit="Bps",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        ),
        table_panel(
            9,
            "Top Namespaces",
            'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
            '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
            {"h": 9, "w": 12, "x": 0, "y": 16},
            unit="Bps",
            transformations=[labels_to_fields()],
        ),
        table_panel(
            10,
            "Top Pods",
            'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
            '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
            {"h": 9, "w": 12, "x": 12, "y": 16},
            unit="Bps",
            transformations=[labels_to_fields()],
        ),
        timeseries_panel(
            11,
            "Traefik Routers (req/s)",
            f"topk(10, {TRAEFIK_ROUTER_EXPR})",
            {"h": 9, "w": 12, "x": 0, "y": 25},
            unit="req/s",
            legend="{{router}}",
            legend_display="table",
            legend_placement="right",
        ),
        timeseries_panel(
            12,
            "Traefik Entrypoints (req/s)",
            'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
            {"h": 9, "w": 12, "x": 12, "y": 25},
            unit="req/s",
            legend="{{entrypoint}}",
            legend_display="table",
            legend_placement="right",
        ),
    ]

    dashboard = {
        "uid": "atlas-network",
        "title": "Atlas Network",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "network"],
    }
    return dashboard
def build_mail_dashboard():
    """Build the "Atlas Mail" dashboard (uid ``atlas-mail``).

    Two rows of stat tiles over ``postmark_*`` metrics are followed by four
    time series. The "Mail Bounces (1d)" tile is written out as a raw panel
    dict because it carries two targets with per-series unit overrides.

    Returns:
        dict: dashboard definition filed under ``PRIVATE_FOLDER``.
    """

    def stepped(base_color, *tiers):
        # base_color is the unconditional first step; tiers are
        # (color, value) pairs appended in the order given.
        steps = [{"color": base_color, "value": None}]
        steps.extend({"color": color, "value": value} for color, value in tiers)
        return {"mode": "absolute", "steps": steps}

    bounce_rate_steps = stepped("green", ("yellow", 5), ("orange", 8), ("red", 10))
    limit_steps = stepped("green", ("yellow", 70), ("orange", 85), ("red", 95))
    # Inverted scale: the base step is red and higher values are better.
    success_steps = stepped("red", ("orange", 90), ("yellow", 95), ("green", 98))

    bounce_targets = [
        {
            "expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
            "refId": "A",
            "legendFormat": "Rate",
        },
        {
            "expr": 'max(postmark_outbound_bounced{window="1d"})',
            "refId": "B",
            "legendFormat": "Count",
        },
    ]
    # The series named "Rate" is shown as a percentage, "Count" as a plain number.
    bounce_overrides = [
        {
            "matcher": {"id": "byName", "options": "Rate"},
            "properties": [{"id": "unit", "value": "percent"}],
        },
        {
            "matcher": {"id": "byName", "options": "Count"},
            "properties": [{"id": "unit", "value": "none"}],
        },
    ]
    bounce_tile = {
        "id": 3,
        "type": "stat",
        "title": "Mail Bounces (1d)",
        "datasource": PROM_DS,
        "gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
        "targets": bounce_targets,
        "fieldConfig": {
            "defaults": {
                "color": {"mode": "thresholds"},
                "custom": {"displayMode": "auto"},
                "thresholds": bounce_rate_steps,
                "unit": "none",
            },
            "overrides": bounce_overrides,
        },
        "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "textMode": "name_and_value",
        },
    }

    panels = [
        stat_panel(
            1,
            "Sent (1d)",
            'max(postmark_outbound_sent{window="1d"})',
            {"h": 4, "w": 6, "x": 0, "y": 0},
            decimals=0,
        ),
        stat_panel(
            2,
            "Sent (7d)",
            'max(postmark_outbound_sent{window="7d"})',
            {"h": 4, "w": 6, "x": 6, "y": 0},
            decimals=0,
        ),
        bounce_tile,
        stat_panel(
            4,
            "Success Rate (1d)",
            'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
            {"h": 4, "w": 6, "x": 18, "y": 0},
            unit="percent",
            thresholds=success_steps,
            decimals=1,
        ),
        stat_panel(
            5,
            "Limit Used (30d)",
            "max(postmark_sending_limit_used_percent)",
            {"h": 4, "w": 6, "x": 0, "y": 4},
            thresholds=limit_steps,
            unit="percent",
            decimals=1,
        ),
        stat_panel(
            6,
            "Send Limit (30d)",
            "max(postmark_sending_limit)",
            {"h": 4, "w": 6, "x": 6, "y": 4},
            decimals=0,
        ),
        stat_panel(
            7,
            "Last Success",
            "max(postmark_last_success_timestamp_seconds)",
            {"h": 4, "w": 6, "x": 12, "y": 4},
            unit="dateTimeAsIso",
            decimals=0,
        ),
        stat_panel(
            8,
            "Exporter Errors",
            "sum(postmark_request_errors_total)",
            {"h": 4, "w": 6, "x": 18, "y": 4},
            decimals=0,
        ),
        timeseries_panel(
            13,
            "Bounce Rate (1d vs 7d)",
            "max by (window) (postmark_outbound_bounce_rate)",
            {"h": 8, "w": 12, "x": 0, "y": 12},
            unit="percent",
            legend="{{window}}",
            legend_display="table",
            legend_placement="right",
        ),
        timeseries_panel(
            14,
            "Bounced (1d vs 7d)",
            "max by (window) (postmark_outbound_bounced)",
            {"h": 8, "w": 12, "x": 12, "y": 12},
            unit="none",
            legend="{{window}}",
            legend_display="table",
            legend_placement="right",
        ),
        timeseries_panel(
            15,
            "Sent (1d vs 7d)",
            "max by (window) (postmark_outbound_sent)",
            {"h": 8, "w": 12, "x": 0, "y": 20},
            unit="none",
            legend="{{window}}",
            legend_display="table",
            legend_placement="right",
        ),
        timeseries_panel(
            16,
            "Exporter Errors",
            "sum(postmark_request_errors_total)",
            {"h": 8, "w": 12, "x": 12, "y": 20},
            unit="none",
        ),
    ]

    dashboard = {
        "uid": "atlas-mail",
        "title": "Atlas Mail",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-30d", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "mail"],
    }
    return dashboard
def build_testing_dashboard():
    """Build the "Atlas Testing" dashboard (uid ``atlas-testing``).

    One stat tile counting stale Glue jobs is followed by eight instant
    tables, all sorted by ``Value`` in descending order.

    Returns:
        dict: dashboard definition filed under ``PRIVATE_FOLDER``.
    """
    # A single list object handed to every table, as one shared pipeline:
    # promote labels to columns, then sort by Value descending.
    sort_desc = [
        {"id": "labelsToFields", "options": {}},
        {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}},
    ]

    panels = [
        stat_panel(
            1,
            "Glue Jobs Stale (>36h)",
            GLUE_STALE_COUNT,
            {"h": 4, "w": 6, "x": 0, "y": 0},
            unit="none",
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 1},
                    {"color": "orange", "value": 2},
                    {"color": "red", "value": 3},
                ],
            },
        )
    ]

    # (panel id, title, expression, grid position, unit) for each table.
    table_specs = (
        (2, "Glue Jobs Missing Success", GLUE_MISSING_ACTIVE,
         {"h": 4, "w": 6, "x": 6, "y": 0}, "none"),
        (3, "Glue Jobs Suspended", GLUE_SUSPENDED,
         {"h": 4, "w": 6, "x": 12, "y": 0}, "none"),
        (4, "Glue Jobs Active Runs", GLUE_ACTIVE,
         {"h": 4, "w": 6, "x": 18, "y": 0}, "none"),
        (5, "Glue Jobs Last Success (hours ago)", GLUE_LAST_SUCCESS_AGE_HOURS,
         {"h": 8, "w": 12, "x": 0, "y": 4}, "h"),
        (6, "Glue Jobs Last Schedule (hours ago)", GLUE_LAST_SCHEDULE_AGE_HOURS,
         {"h": 8, "w": 12, "x": 12, "y": 4}, "h"),
        (7, "Ariadne Task Errors (24h)", ARIADNE_TASK_ERRORS_24H,
         {"h": 6, "w": 12, "x": 0, "y": 12}, "none"),
        (8, "Ariadne Schedule Last Success (hours ago)", ARIADNE_SCHEDULE_LAST_SUCCESS_HOURS,
         {"h": 6, "w": 12, "x": 12, "y": 12}, "h"),
        (9, "Ariadne Access Requests", ARIADNE_ACCESS_REQUESTS,
         {"h": 4, "w": 24, "x": 0, "y": 18}, "none"),
    )
    for panel_id, title, expr, grid, unit in table_specs:
        panels.append(
            table_panel(
                panel_id,
                title,
                expr,
                grid,
                unit=unit,
                transformations=sort_desc,
                instant=True,
            )
        )

    dashboard = {
        "uid": "atlas-testing",
        "title": "Atlas Testing",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-7d", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "testing"],
    }
    return dashboard
def build_gpu_dashboard():
    """Build the "Atlas GPU" dashboard (uid ``atlas-gpu``).

    Four panels: GPU share per namespace (pie), GPU utilisation per namespace
    and per node (time series), and a top-pods table. The namespace panels
    are scoped by the ``$namespace_scope_gpu`` dashboard variable.

    Returns:
        dict: dashboard definition filed under ``PRIVATE_FOLDER``, including
        the three namespace-scope template variables.
    """
    scope_ref = "$namespace_scope_gpu"

    share_pie = pie_panel(
        1,
        "Namespace GPU Share",
        namespace_gpu_share_expr(scope_ref),
        {"h": 8, "w": 12, "x": 0, "y": 0},
        links=namespace_scope_links("namespace_scope_gpu"),
        description="Shares are normalized within the selected filter. Switching scope changes the denominator.",
    )
    by_namespace = timeseries_panel(
        2,
        "GPU Util by Namespace",
        namespace_gpu_usage_instant(scope_ref),
        {"h": 8, "w": 12, "x": 12, "y": 0},
        unit="percent",
        legend="{{namespace}}",
        legend_display="table",
        legend_placement="right",
    )
    by_node = timeseries_panel(
        3,
        "GPU Util by Node",
        'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
        {"h": 8, "w": 12, "x": 0, "y": 8},
        unit="percent",
        legend="{{Hostname}}",
        legend_display="table",
        legend_placement="right",
    )
    top_pods = table_panel(
        4,
        "Top Pods by GPU Util",
        'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
        {"h": 8, "w": 12, "x": 12, "y": 8},
        unit="percent",
        transformations=[{"id": "labelsToFields", "options": {}}],
    )
    panels = [share_pie, by_namespace, by_node, top_pods]

    dashboard = {
        "uid": "atlas-gpu",
        "title": "Atlas GPU",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "gpu"],
        "templating": {
            "list": [
                namespace_scope_variable("namespace_scope_cpu", "CPU namespace filter"),
                namespace_scope_variable("namespace_scope_gpu", "GPU namespace filter"),
                namespace_scope_variable("namespace_scope_ram", "RAM namespace filter"),
            ]
        },
    }
    return dashboard
# Registry of every generated dashboard: uid -> the builder that produces its
# definition and the ConfigMap manifest the rendered JSON is written into.
DASHBOARDS = {
    "atlas-overview": {
        "builder": build_overview,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-overview.yaml",
    },
    "atlas-pods": {
        "builder": build_pods_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-pods.yaml",
    },
    "atlas-nodes": {
        "builder": build_nodes_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-nodes.yaml",
    },
    "atlas-storage": {
        "builder": build_storage_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-storage.yaml",
    },
    "atlas-network": {
        "builder": build_network_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml",
    },
    "atlas-mail": {
        "builder": build_mail_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-mail.yaml",
    },
    "atlas-testing": {
        "builder": build_testing_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-testing.yaml",
    },
    "atlas-gpu": {
        "builder": build_gpu_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",
    },
}


def write_json(uid, data):
    """Write one dashboard definition to ``DASHBOARD_DIR/<uid>.json``.

    Args:
        uid: dashboard uid; used as the file stem.
        data: JSON-serializable dashboard definition.

    The directory is created on demand. The file is written as UTF-8
    regardless of the host locale, with 2-space indentation and a trailing
    newline.
    """
    DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
    path = DASHBOARD_DIR / f"{uid}.json"
    path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")


def render_configmap(uid, info):
    """Render ``DASHBOARD_DIR/<uid>.json`` into its ConfigMap manifest.

    Args:
        uid: dashboard uid whose JSON file is read.
        info: registry entry; ``info["configmap"]`` is the output path, which
            must live under ``ROOT``.

    Raises:
        FileNotFoundError: the dashboard JSON has not been generated yet.
        json.JSONDecodeError: the JSON file is not valid JSON.
    """
    json_path = DASHBOARD_DIR / f"{uid}.json"
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default,
        # which would mis-decode hand-edited non-ASCII text on some hosts.
        raw = json_path.read_text(encoding="utf-8")
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"{json_path} does not exist; run with --build to generate the dashboard JSON first"
        ) from err

    # Round-trip through the parser so the embedded payload is normalized to
    # 2-space indentation however the file on disk was formatted.
    payload = json.dumps(json.loads(raw), indent=2)
    # NOTE(review): the indent prefix is reproduced as seen in the source; it
    # must be deeper than the `{key}: |` line in CONFIG_TEMPLATE for the YAML
    # block scalar to parse - confirm against the template.
    indented = "\n".join(" " + line for line in payload.splitlines())

    output_path = info["configmap"]
    relative_path = output_path.relative_to(ROOT)
    content = CONFIG_TEMPLATE.format(
        relative_path=relative_path,
        name=output_path.stem,
        key=json_path.name,
        payload=indented,
    )
    output_path.write_text(content, encoding="utf-8")
    print(f"Rendered {json_path.name} -> {relative_path}")


def main():
    """Command-line entry point.

    With ``--build`` every dashboard JSON file is regenerated from its
    builder first. ConfigMaps are then rendered from the JSON files in both
    modes.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--build",
        action="store_true",
        help="Regenerate dashboard JSON files from builders",
    )
    args = parser.parse_args()

    if args.build:
        for uid, info in DASHBOARDS.items():
            write_json(uid, info["builder"]())

    for uid, info in DASHBOARDS.items():
        render_configmap(uid, info)


if __name__ == "__main__":
    main()