titan-iac/scripts/dashboards_render_atlas.py


#!/usr/bin/env python3
"""Generate Atlas Grafana dashboards and render them into ConfigMaps.
Usage:
scripts/dashboards_render_atlas.py --build # rebuild JSON + ConfigMaps
scripts/dashboards_render_atlas.py # re-render ConfigMaps from JSON
"""
import argparse
import json
import textwrap
from pathlib import Path
# ---------------------------------------------------------------------------
# Paths, folders, and shared metadata
# ---------------------------------------------------------------------------
ROOT = Path(__file__).resolve().parents[1]
DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards"
CONFIG_TEMPLATE = textwrap.dedent(
"""# {relative_path}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {name}
  labels:
    grafana_dashboard: "1"
data:
  {key}: |
{payload}
"""
)
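# The rendered ConfigMap nests the dashboard JSON under a single data key, roughly:
#   metadata:
#     name: grafana-dashboard-overview        # derived from the output filename
#     labels:
#       grafana_dashboard: "1"                # label the Grafana sidecar typically watches for
#   data:
#     atlas-overview.json: |
#       { ...dashboard JSON... }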
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
PUBLIC_FOLDER = "overview"
PRIVATE_FOLDER = "atlas-internal"
PERCENT_THRESHOLDS = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 75},
{"color": "red", "value": 91.5},
],
}
# ---------------------------------------------------------------------------
# Cluster metadata
# ---------------------------------------------------------------------------
CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
CONTROL_DEPENDENCIES = ["titan-db"]
CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES
WORKER_NODES = [
"titan-04",
"titan-05",
"titan-06",
"titan-07",
"titan-08",
"titan-09",
"titan-10",
"titan-11",
"titan-12",
"titan-13",
"titan-14",
"titan-15",
"titan-16",
"titan-17",
"titan-18",
"titan-19",
"titan-22",
"titan-24",
]
CONTROL_REGEX = "|".join(CONTROL_PLANE_NODES)
CONTROL_ALL_REGEX = "|".join(CONTROL_ALL)
WORKER_REGEX = "|".join(WORKER_NODES)
CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
WORKER_TOTAL = len(WORKER_NODES)
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system"
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
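# Column widths for the overview's first row of stats/gauges; they sum to 24, the full
# width of Grafana's grid, so the row spans the dashboard exactly (see gauge_grid()).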
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
CONTROL_WORKLOADS_EXPR = (
f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
)
# ---------------------------------------------------------------------------
# PromQL helpers
# ---------------------------------------------------------------------------
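# node-exporter series are keyed by `instance`, while kube-state-metrics uses `node`.
# NODE_INFO relabels node_uname_info's `nodename` into a `node` label so the two metric
# families can be joined; scoped_node_expr() below performs that join per node.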
NODE_INFO = 'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")'
def node_filter(regex):
"""Return a selector that evaluates to 1 for nodes matching the regex."""
return (
f'label_replace(node_uname_info{{nodename=~"{regex}"}}, '
'"node", "$1", "nodename", "(.*)")'
)
def scoped_node_expr(base, scope=""):
"""Attach nodename metadata and optionally filter to a scope regex."""
expr = f"avg by (node) (({base}) * on(instance) group_left(node) {NODE_INFO})"
if scope:
expr = f"({expr}) * on(node) group_left() {node_filter(scope)}"
return expr
def node_cpu_expr(scope=""):
idle = 'avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))'
base = f"(1 - {idle}) * 100"
return scoped_node_expr(base, scope)
def node_mem_expr(scope=""):
usage = (
"avg by (instance) ("
"(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) "
"/ node_memory_MemTotal_bytes * 100)"
)
return scoped_node_expr(usage, scope)
def filesystem_usage_expr(mount, scope=""):
base = (
f'avg by (instance) ('
f'(1 - (node_filesystem_avail_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}} '
f'/ node_filesystem_size_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}})) * 100)'
)
return scoped_node_expr(base, scope)
def root_usage_expr(scope=""):
return filesystem_usage_expr("/", scope)
def astreae_usage_expr(mount):
return (
f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / "
f"sum(node_filesystem_size_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) * 100)"
)
def astreae_free_expr(mount):
return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})"
def topk_with_node(expr):
return f'label_replace(topk(1, {expr}), "__name__", "$1", "node", "(.*)")'
def node_net_expr(scope=""):
base = (
'sum by (instance) ('
'rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
'+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))'
)
return scoped_node_expr(base, scope)
def node_io_expr(scope=""):
base = (
"sum by (instance) (rate(node_disk_read_bytes_total[5m]) "
"+ rate(node_disk_written_bytes_total[5m]))"
)
return scoped_node_expr(base, scope)
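# Namespace "share" pies: restrict to the top-10 most active namespaces (see
# NAMESPACE_TOP_FILTER, resolved at call time) and renormalise so the slices sum to 100%.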
def namespace_share_expr(resource_expr):
selected = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )"
total = f"clamp_min(sum( {selected} ), 1)"
return f"100 * ( {selected} ) / {total}"
def namespace_cpu_share_expr():
return namespace_share_expr(NAMESPACE_CPU_RAW)
def namespace_ram_share_expr():
return namespace_share_expr(NAMESPACE_RAM_RAW)
def namespace_gpu_share_expr():
return namespace_share_expr(NAMESPACE_GPU_RAW)
PROBLEM_PODS_EXPR = (
'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"})) '
"or on() vector(0)"
)
CRASHLOOP_EXPR = (
'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
'{reason=~"CrashLoopBackOff|ImagePullBackOff"})) '
"or on() vector(0)"
)
STUCK_TERMINATING_EXPR = (
'sum(max by (namespace,pod) ('
'((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)'
' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
')) '
"or on() vector(0)"
)
UPTIME_WINDOW = "30d"
TRAEFIK_READY_EXPR = (
"("
'sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
" / clamp_min("
'sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1)'
")"
)
CONTROL_READY_FRACTION_EXPR = (
f"(sum(kube_node_status_condition{{condition=\"Ready\",status=\"true\",node=~\"{CONTROL_REGEX}\"}})"
f" / {CONTROL_TOTAL})"
)
# PromQL's min() aggregates a single vector and cannot take two expressions, so take the
# element-wise minimum of the two single-series vectors with a comparison plus `or` fallback.
UPTIME_AVAIL_EXPR = (
f"((({CONTROL_READY_FRACTION_EXPR}) <= ({TRAEFIK_READY_EXPR})) or ({TRAEFIK_READY_EXPR}))"
)
UPTIME_AVG_EXPR = f"avg_over_time(({UPTIME_AVAIL_EXPR})[{UPTIME_WINDOW}:5m])"
UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
UPTIME_NINES_EXPR = f"-log10(1 - clamp_max({UPTIME_AVG_EXPR}, 0.999999999))"
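# "Nines" of availability: -log10(1 - A), e.g. A = 0.999 -> 3 nines. The clamp keeps the
# logarithm finite when availability is exactly 1 (caps the display at ~9 nines).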
UPTIME_THRESHOLDS = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 2},
{"color": "yellow", "value": 3},
{"color": "green", "value": 3.5},
],
}
UPTIME_PERCENT_THRESHOLDS = {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 0.999},
{"color": "yellow", "value": 0.9999},
{"color": "green", "value": 0.99999},
],
}
PROBLEM_TABLE_EXPR = (
# `== 1` keeps only each pod's current phase, avoiding many-to-many matches against the
# 0-valued phase series kube-state-metrics also exports.
"(time() - kube_pod_created{pod!=\"\"}) "
"* on(namespace,pod) group_left(node) kube_pod_info "
"* on(namespace,pod) group_left(phase) "
"max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"} == 1)"
)
CRASHLOOP_TABLE_EXPR = (
# Start from the per-container waiting-reason series (so the container label exists on the
# many side), then join node and pod age via many-to-one matches on (namespace,pod).
"max by (namespace,pod,container,reason) "
"(kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"} == 1) "
"* on(namespace,pod) group_left(node) kube_pod_info "
"* on(namespace,pod) group_left() (time() - kube_pod_created{pod!=\"\"})"
)
STUCK_TABLE_EXPR = (
"("
"((time() - kube_pod_deletion_timestamp{pod!=\"\"}) "
"and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) "
"* on(namespace,pod) group_left(node) kube_pod_info"
")"
)
NAMESPACE_CPU_RAW = (
'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
)
NAMESPACE_RAM_RAW = (
'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
)
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES)
NAMESPACE_GPU_ALLOC = (
'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
)
NAMESPACE_GPU_USAGE_SHARE = (
'sum by (namespace) (max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[$__range]))'
)
NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
NAMESPACE_GPU_RAW = (
"("
+ NAMESPACE_GPU_USAGE_SHARE
+ ") or on(namespace) ("
+ NAMESPACE_CPU_RAW
+ " * 0)"
)
NAMESPACE_GPU_WEIGHT = (
"("
+ NAMESPACE_GPU_ALLOC
+ ") or on(namespace) ("
+ NAMESPACE_CPU_RAW
+ " * 0)"
)
NAMESPACE_ACTIVITY_SCORE = (
"( "
+ NAMESPACE_CPU_RAW
+ " ) + ("
+ NAMESPACE_RAM_RAW
+ " / 1e9) + ("
+ NAMESPACE_GPU_WEIGHT
+ " * 100)"
)
NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
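# Activity score per namespace: CPU cores + RAM in GB + 100x requested/limited GPUs.
# `topk(10, ...) >= bool 0` turns the top-10 set into a constant-1 mask that the share
# expressions above apply with `and on(namespace)`.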
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
TRAEFIK_NET_INGRESS = (
'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
" or on() vector(0)"
)
TRAEFIK_NET_EGRESS = (
'sum(rate(container_network_transmit_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
" or on() vector(0)"
)
NET_CLUSTER_RX = (
'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
" or on() vector(0)"
)
NET_CLUSTER_TX = (
'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
" or on() vector(0)"
)
PHYSICAL_NET_FILTER = 'device!~"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*"'
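# Exclude loopback and virtual/overlay interfaces so node NIC counters approximate traffic
# that actually crosses the physical network; the container counters are used for the
# intra-cluster view (NET_INTERNAL_EXPR below).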
NET_NODE_RX_PHYS = (
f'sum(rate(node_network_receive_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)'
)
NET_NODE_TX_PHYS = (
f'sum(rate(node_network_transmit_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)'
)
NET_TOTAL_EXPR = NET_NODE_TX_PHYS
NET_INGRESS_EXPR = NET_NODE_RX_PHYS
NET_EGRESS_EXPR = NET_NODE_TX_PHYS
NET_INTERNAL_EXPR = (
'sum(rate(container_network_receive_bytes_total{namespace!="traefik",pod!=""}[5m]) '
'+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))'
' or on() vector(0)'
)
APISERVER_5XX_RATE = 'sum(rate(apiserver_request_total{code=~"5.."}[5m]))'
APISERVER_P99_LATENCY_MS = (
"histogram_quantile(0.99, sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))) * 1000"
)
ETCD_P99_LATENCY_MS = (
"histogram_quantile(0.99, sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))) * 1000"
)
TRAEFIK_TOTAL_5M = "sum(rate(traefik_entrypoint_requests_total[5m]))"
TRAEFIK_SUCCESS_5M = 'sum(rate(traefik_entrypoint_requests_total{code!~"5.."}[5m]))'
TRAEFIK_SLI_5M = f"({TRAEFIK_SUCCESS_5M}) / clamp_min({TRAEFIK_TOTAL_5M}, 1)"
TRAEFIK_P99_LATENCY_MS = (
"histogram_quantile(0.99, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
)
TRAEFIK_P95_LATENCY_MS = (
"histogram_quantile(0.95, sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))) * 1000"
)
SLO_AVAILABILITY = 0.999
def traefik_sli(window):
total = f'sum(rate(traefik_entrypoint_requests_total[{window}]))'
success = f'sum(rate(traefik_entrypoint_requests_total{{code!~"5.."}}[{window}]))'
return f"({success}) / clamp_min({total}, 1)"
def traefik_burn(window):
sli = traefik_sli(window)
return f"(1 - ({sli})) / {1 - SLO_AVAILABILITY}"
# ---------------------------------------------------------------------------
# Panel factories
# ---------------------------------------------------------------------------
def stat_panel(
panel_id,
title,
expr,
grid,
*,
unit="none",
decimals=None,
thresholds=None,
text_mode="value",
legend=None,
instant=False,
value_suffix=None,
links=None,
):
"""Return a Grafana stat panel definition."""
defaults = {
"color": {"mode": "thresholds"},
"mappings": [],
"thresholds": thresholds
or {
"mode": "absolute",
"steps": [
{"color": "rgba(115, 115, 115, 1)", "value": None},
{"color": "green", "value": 1},
],
},
"unit": unit,
"custom": {"displayMode": "auto"},
}
if value_suffix:
defaults["custom"]["valueSuffix"] = value_suffix
if decimals is not None:
defaults["decimals"] = decimals
panel = {
"id": panel_id,
"type": "stat",
"title": title,
"datasource": PROM_DS,
"gridPos": grid,
"targets": [{"expr": expr, "refId": "A"}],
"fieldConfig": {"defaults": defaults, "overrides": []},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
"textMode": text_mode,
},
}
if legend:
panel["targets"][0]["legendFormat"] = legend
if instant:
panel["targets"][0]["instant"] = True
if links:
panel["links"] = links
return panel
def gauge_panel(
panel_id,
title,
expr,
grid,
*,
min_value=0,
max_value=1,
thresholds=None,
links=None,
):
return {
"id": panel_id,
"type": "gauge",
"title": title,
"datasource": PROM_DS,
"gridPos": grid,
"targets": [{"expr": expr, "refId": "A"}],
"fieldConfig": {
"defaults": {
"min": min_value,
"max": max_value,
"thresholds": thresholds
or {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": max_value},
],
},
},
"overrides": [],
},
"options": {
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
"orientation": "auto",
"showThresholdMarkers": False,
"showThresholdLabels": False,
},
**({"links": links} if links else {}),
}
def timeseries_panel(
panel_id,
title,
expr,
grid,
*,
unit="none",
legend=None,
legend_display="table",
legend_placement="bottom",
legend_calcs=None,
time_from=None,
links=None,
):
"""Return a Grafana time-series panel definition."""
panel = {
"id": panel_id,
"type": "timeseries",
"title": title,
"datasource": PROM_DS,
"gridPos": grid,
"targets": [{"expr": expr, "refId": "A"}],
"fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
"options": {
"legend": {
"displayMode": legend_display,
"placement": legend_placement,
},
"tooltip": {"mode": "multi"},
},
}
if legend:
panel["targets"][0]["legendFormat"] = legend
if legend_calcs:
panel["options"]["legend"]["calcs"] = legend_calcs
if time_from:
panel["timeFrom"] = time_from
if links:
panel["links"] = links
return panel
def table_panel(
panel_id,
title,
expr,
grid,
*,
unit="none",
transformations=None,
instant=False,
):
"""Return a Grafana table panel definition."""
# Callers may pass composed PromQL (e.g. the templated `share` sub-expression used by the
# "Namespace Plurality by Node" table in build_pods_dashboard).
panel = {
"id": panel_id,
"type": "table",
"title": title,
"datasource": PROM_DS,
"gridPos": grid,
"targets": [{"expr": expr, "refId": "A", **({"instant": True} if instant else {})}],
"fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
"options": {"showHeader": True},
}
if transformations:
panel["transformations"] = transformations
return panel
def pie_panel(panel_id, title, expr, grid):
"""Return a pie chart panel with readable namespace labels."""
return {
"id": panel_id,
"type": "piechart",
"title": title,
"datasource": PROM_DS,
"gridPos": grid,
"targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
"fieldConfig": {
"defaults": {
"unit": "percent",
"color": {"mode": "palette-classic"},
},
"overrides": [],
},
"options": {
"legend": {"displayMode": "list", "placement": "right"},
"pieType": "pie",
"displayLabels": [],
"tooltip": {"mode": "single"},
"colorScheme": "interpolateSpectral",
"colorBy": "value",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
},
}
def bargauge_panel(
panel_id,
title,
expr,
grid,
*,
unit="none",
links=None,
limit=None,
thresholds=None,
decimals=None,
instant=False,
):
"""Return a bar gauge panel with label-aware reduction."""
panel = {
"id": panel_id,
"type": "bargauge",
"title": title,
"datasource": PROM_DS,
"gridPos": grid,
"targets": [
{"expr": expr, "refId": "A", "legendFormat": "{{node}}", **({"instant": True} if instant else {})}
],
"fieldConfig": {
"defaults": {
"unit": unit,
"min": 0,
"max": 100 if unit == "percent" else None,
"thresholds": thresholds
or {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 70},
{"color": "red", "value": 85},
],
},
},
"overrides": [],
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": False,
},
},
}
if decimals is not None:
panel["fieldConfig"]["defaults"]["decimals"] = decimals
if links:
panel["links"] = links
# Keep bars ordered by value descending for readability.
panel["transformations"] = [
{
"id": "sortBy",
"options": {"fields": ["Value"], "order": "desc"},
}
]
if limit:
panel["transformations"].append({"id": "limit", "options": {"limit": limit}})
return panel
def text_panel(panel_id, title, content, grid):
return {
"id": panel_id,
"type": "text",
"title": title,
"gridPos": grid,
"datasource": None,
"options": {"mode": "markdown", "content": content},
}
def link_to(uid):
return [{"title": f"Open {uid} dashboard", "url": f"/d/{uid}", "targetBlank": True}]
# ---------------------------------------------------------------------------
# Dashboard builders
# ---------------------------------------------------------------------------
def build_overview():
panels = []
count_thresholds = {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
}
row1_stats = [
{
"id": 2,
"title": "Control Plane Ready",
"expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
"kind": "gauge",
"max_value": CONTROL_TOTAL,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "green", "value": CONTROL_TOTAL},
],
},
},
{
"id": 3,
"title": "Control Plane Workloads",
"expr": CONTROL_WORKLOADS_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
},
"links": link_to("atlas-pods"),
},
{
"id": 5,
"title": "Stuck Terminating",
"expr": STUCK_TERMINATING_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
},
"links": link_to("atlas-pods"),
},
{
"id": 27,
"title": "Atlas Availability (30d)",
"expr": UPTIME_PERCENT_EXPR,
"kind": "stat",
"thresholds": UPTIME_PERCENT_THRESHOLDS,
"unit": "percentunit",
"decimals": 3,
"text_mode": "value",
},
{
"id": 4,
"title": "Problem Pods",
"expr": PROBLEM_PODS_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
},
"links": link_to("atlas-pods"),
},
{
"id": 6,
"title": "CrashLoop / ImagePull",
"expr": CRASHLOOP_EXPR,
"kind": "stat",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 3},
],
},
"links": link_to("atlas-pods"),
},
{
"id": 1,
"title": "Workers Ready",
"expr": f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
"kind": "gauge",
"max_value": WORKER_TOTAL,
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": WORKER_TOTAL - 2},
{"color": "yellow", "value": WORKER_TOTAL - 1},
{"color": "green", "value": WORKER_TOTAL},
],
},
},
]
def gauge_grid(idx):
width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
x = sum(GAUGE_WIDTHS[:idx])
return width, x
for idx, item in enumerate(row1_stats):
panel_id = item["id"]
width, x = gauge_grid(idx)
grid = {"h": 5, "w": width, "x": x, "y": 0}
kind = item.get("kind", "gauge")
if kind == "stat":
panels.append(
stat_panel(
panel_id,
item["title"],
item["expr"],
grid,
thresholds=item.get("thresholds"),
legend=None,
links=item.get("links"),
text_mode=item.get("text_mode", "value"),
value_suffix=item.get("value_suffix"),
unit=item.get("unit", "none"),
decimals=item.get("decimals"),
)
)
else:
panels.append(
gauge_panel(
panel_id,
item["title"],
item["expr"],
grid,
min_value=0,
max_value=item.get("max_value", 5),
thresholds=item.get("thresholds"),
links=item.get("links"),
)
)
hottest = [
(7, "Hottest node: CPU", topk_with_node(node_cpu_expr()), "percent"),
(8, "Hottest node: RAM", topk_with_node(node_mem_expr()), "percent"),
(9, "Hottest node: NET (rx+tx)", topk_with_node(node_net_expr()), "Bps"),
(10, "Hottest node: I/O (r+w)", topk_with_node(node_io_expr()), "Bps"),
]
for idx, (panel_id, title, expr, unit) in enumerate(hottest):
panels.append(
stat_panel(
panel_id,
title,
f"{expr}",
{"h": 3, "w": 6, "x": 6 * idx, "y": 5},
unit=unit,
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
text_mode="name_and_value",
legend="{{node}}",
instant=True,
links=link_to("atlas-nodes"),
)
)
storage_panels = [
(23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
(24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"),
(25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"),
(26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"),
]
for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
panels.append(
stat_panel(
panel_id,
title,
expr,
{"h": 6, "w": 6, "x": 6 * idx, "y": 10},
unit=unit,
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
links=link_to("atlas-storage"),
)
)
panels.append(
pie_panel(
11,
"Namespace CPU Share",
namespace_cpu_share_expr(),
{"h": 9, "w": 8, "x": 0, "y": 16},
)
)
panels.append(
pie_panel(
12,
"Namespace GPU Share",
namespace_gpu_share_expr(),
{"h": 9, "w": 8, "x": 8, "y": 16},
)
)
panels.append(
pie_panel(
13,
"Namespace RAM Share",
namespace_ram_share_expr(),
{"h": 9, "w": 8, "x": 16, "y": 16},
)
)
worker_filter = f"{WORKER_REGEX}"
panels.append(
timeseries_panel(
14,
"Worker Node CPU",
node_cpu_expr(worker_filter),
{"h": 12, "w": 12, "x": 0, "y": 32},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
legend_display="table",
legend_placement="right",
links=link_to("atlas-nodes"),
)
)
panels.append(
timeseries_panel(
15,
"Worker Node RAM",
node_mem_expr(worker_filter),
{"h": 12, "w": 12, "x": 12, "y": 32},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
legend_display="table",
legend_placement="right",
links=link_to("atlas-nodes"),
)
)
panels.append(
timeseries_panel(
16,
"Control plane CPU",
node_cpu_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 0, "y": 44},
unit="percent",
legend="{{node}}",
legend_display="table",
legend_placement="right",
)
)
panels.append(
timeseries_panel(
17,
"Control plane RAM",
node_mem_expr(CONTROL_ALL_REGEX),
{"h": 10, "w": 12, "x": 12, "y": 44},
unit="percent",
legend="{{node}}",
legend_display="table",
legend_placement="right",
)
)
panels.append(
pie_panel(
28,
"Node Pod Share",
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
{"h": 10, "w": 12, "x": 0, "y": 54},
)
)
panels.append(
bargauge_panel(
29,
"Top Nodes by Pod Count",
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
{"h": 10, "w": 12, "x": 12, "y": 54},
unit="none",
limit=12,
decimals=0,
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 75},
{"color": "red", "value": 100},
],
},
instant=True,
)
)
panels.append(
timeseries_panel(
18,
"Cluster Ingress Throughput",
NET_INGRESS_EXPR,
{"h": 7, "w": 8, "x": 0, "y": 25},
unit="Bps",
legend="Ingress (Traefik)",
legend_display="list",
legend_placement="bottom",
links=link_to("atlas-network"),
)
)
panels.append(
timeseries_panel(
19,
"Cluster Egress Throughput",
NET_EGRESS_EXPR,
{"h": 7, "w": 8, "x": 8, "y": 25},
unit="Bps",
legend="Egress (Traefik)",
legend_display="list",
legend_placement="bottom",
links=link_to("atlas-network"),
)
)
panels.append(
timeseries_panel(
20,
"Intra-Cluster Throughput",
NET_INTERNAL_EXPR,
{"h": 7, "w": 8, "x": 16, "y": 25},
unit="Bps",
legend="Internal traffic",
legend_display="list",
legend_placement="bottom",
links=link_to("atlas-network"),
)
)
panels.append(
timeseries_panel(
21,
"Root Filesystem Usage",
root_usage_expr(),
{"h": 16, "w": 12, "x": 0, "y": 64},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
legend_display="table",
legend_placement="right",
time_from="30d",
links=link_to("atlas-storage"),
)
)
panels.append(
bargauge_panel(
22,
"Nodes Closest to Full Root Disks",
f"topk(12, {root_usage_expr()})",
{"h": 16, "w": 12, "x": 12, "y": 64},
unit="percent",
thresholds=PERCENT_THRESHOLDS,
links=link_to("atlas-storage"),
)
)
return {
"uid": "atlas-overview",
"title": "Atlas Overview",
"folderUid": PUBLIC_FOLDER,
"editable": False,
"annotations": {"list": []},
"panels": panels,
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "overview"],
"templating": {"list": []},
"time": {"from": "now-1h", "to": "now"},
"refresh": "1m",
"links": [],
}
def build_pods_dashboard():
panels = []
panels.append(
stat_panel(
1,
"Problem Pods",
PROBLEM_PODS_EXPR,
{"h": 4, "w": 6, "x": 0, "y": 0},
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": 1},
],
},
)
)
panels.append(
stat_panel(
2,
"CrashLoop / ImagePull",
CRASHLOOP_EXPR,
{"h": 4, "w": 6, "x": 6, "y": 0},
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": 1},
],
},
)
)
panels.append(
stat_panel(
3,
"Stuck Terminating (>10m)",
STUCK_TERMINATING_EXPR,
{"h": 4, "w": 6, "x": 12, "y": 0},
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": 1},
],
},
)
)
panels.append(
stat_panel(
4,
"Control Plane Workloads",
f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
{"h": 4, "w": 6, "x": 18, "y": 0},
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "red", "value": 1},
],
},
)
)
panels.append(
table_panel(
5,
"Pods Not Running",
PROBLEM_TABLE_EXPR,
{"h": 10, "w": 24, "x": 0, "y": 4},
unit="s",
transformations=[{"id": "labelsToFields", "options": {}}],
)
)
panels.append(
table_panel(
6,
"CrashLoop / ImagePull",
CRASHLOOP_TABLE_EXPR,
{"h": 10, "w": 24, "x": 0, "y": 14},
unit="s",
transformations=[{"id": "labelsToFields", "options": {}}],
)
)
panels.append(
table_panel(
7,
"Terminating >10m",
STUCK_TABLE_EXPR,
{"h": 10, "w": 24, "x": 0, "y": 24},
unit="s",
transformations=[
{"id": "labelsToFields", "options": {}},
{"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
],
)
)
panels.append(
pie_panel(
8,
"Node Pod Share",
'(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
{"h": 8, "w": 12, "x": 12, "y": 34},
)
)
panels.append(
bargauge_panel(
9,
"Top Nodes by Pod Count",
'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
{"h": 8, "w": 12, "x": 0, "y": 34},
unit="none",
limit=12,
decimals=0,
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 75},
{"color": "red", "value": 100},
],
},
instant=True,
)
)
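# "Namespace Plurality by Node": `share` is the fraction of a namespace's pods on each
# node; the >= bool comparison against the per-namespace maximum (with a 1e-9 tolerance)
# keeps only the node hosting the largest share, i.e. where each namespace mostly runs.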
panels.append(
table_panel(
10,
"Namespace Plurality by Node",
(
"("
" {share}"
" * on(namespace) group_left(node)"
" ({share} >= bool on(namespace) group_left() (max by (namespace) ({share}) - 1e-9))"
") * 100"
).format(
share=(
"(sum by (namespace,node) (kube_pod_info{pod!=\"\"}) "
"/ on(namespace) clamp_min(sum by (namespace) (kube_pod_info{pod!=\"\"}), 1))"
)
),
{"h": 8, "w": 24, "x": 0, "y": 42},
unit="percent",
transformations=[
{"id": "labelsToFields", "options": {}},
{"id": "sortBy", "options": {"fields": ["node", "Value"], "order": "asc"}},
],
instant=True,
)
)
return {
"uid": "atlas-pods",
"title": "Atlas Pods",
"folderUid": PRIVATE_FOLDER,
"editable": True,
"panels": panels,
"time": {"from": "now-12h", "to": "now"},
"annotations": {"list": []},
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "pods"],
}
def build_nodes_dashboard():
panels = []
panels.append(
stat_panel(
1,
"Worker Nodes Ready",
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
{"h": 4, "w": 8, "x": 0, "y": 0},
value_suffix=WORKER_SUFFIX,
)
)
panels.append(
stat_panel(
2,
"Control Plane Ready",
f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
{"h": 4, "w": 8, "x": 8, "y": 0},
value_suffix=CONTROL_SUFFIX,
)
)
panels.append(
stat_panel(
3,
"Control Plane Workloads",
f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
{"h": 4, "w": 8, "x": 16, "y": 0},
)
)
panels.append(
stat_panel(
9,
"API Server 5xx rate",
APISERVER_5XX_RATE,
{"h": 4, "w": 8, "x": 0, "y": 4},
unit="req/s",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 0.05},
{"color": "orange", "value": 0.2},
{"color": "red", "value": 0.5},
],
},
decimals=3,
)
)
panels.append(
stat_panel(
10,
"API Server P99 latency",
APISERVER_P99_LATENCY_MS,
{"h": 4, "w": 8, "x": 8, "y": 4},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 250},
{"color": "orange", "value": 400},
{"color": "red", "value": 600},
],
},
decimals=1,
)
)
panels.append(
stat_panel(
11,
"etcd P99 latency",
ETCD_P99_LATENCY_MS,
{"h": 4, "w": 8, "x": 16, "y": 4},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 50},
{"color": "orange", "value": 100},
{"color": "red", "value": 200},
],
},
decimals=1,
)
)
panels.append(
timeseries_panel(
4,
"Node CPU",
node_cpu_expr(),
{"h": 9, "w": 24, "x": 0, "y": 8},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
legend_display="table",
legend_placement="right",
)
)
panels.append(
timeseries_panel(
5,
"Node RAM",
node_mem_expr(),
{"h": 9, "w": 24, "x": 0, "y": 17},
unit="percent",
legend="{{node}}",
legend_calcs=["last"],
legend_display="table",
legend_placement="right",
)
)
panels.append(
timeseries_panel(
6,
"Control Plane (incl. titan-db) CPU",
node_cpu_expr(CONTROL_ALL_REGEX),
{"h": 9, "w": 12, "x": 0, "y": 26},
unit="percent",
legend="{{node}}",
legend_display="table",
legend_placement="right",
)
)
panels.append(
timeseries_panel(
7,
"Control Plane (incl. titan-db) RAM",
node_mem_expr(CONTROL_ALL_REGEX),
{"h": 9, "w": 12, "x": 12, "y": 26},
unit="percent",
legend="{{node}}",
legend_display="table",
legend_placement="right",
)
)
panels.append(
timeseries_panel(
8,
"Root Filesystem Usage",
root_usage_expr(),
{"h": 9, "w": 24, "x": 0, "y": 35},
unit="percent",
legend="{{node}}",
legend_display="table",
legend_placement="right",
time_from="30d",
)
)
return {
"uid": "atlas-nodes",
"title": "Atlas Nodes",
"folderUid": PRIVATE_FOLDER,
"editable": True,
"panels": panels,
"time": {"from": "now-12h", "to": "now"},
"annotations": {"list": []},
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "nodes"],
}
def build_storage_dashboard():
panels = []
panels.append(
stat_panel(
1,
"Astreae Usage",
astreae_usage_expr("/mnt/astreae"),
{"h": 5, "w": 6, "x": 0, "y": 0},
unit="percent",
thresholds=PERCENT_THRESHOLDS,
)
)
panels.append(
stat_panel(
2,
"Asteria Usage",
astreae_usage_expr("/mnt/asteria"),
{"h": 5, "w": 6, "x": 6, "y": 0},
unit="percent",
thresholds=PERCENT_THRESHOLDS,
)
)
panels.append(
stat_panel(
3,
"Astreae Free",
astreae_free_expr("/mnt/astreae"),
{"h": 5, "w": 6, "x": 12, "y": 0},
unit="decbytes",
)
)
panels.append(
stat_panel(
4,
"Asteria Free",
astreae_free_expr("/mnt/asteria"),
{"h": 5, "w": 6, "x": 18, "y": 0},
unit="decbytes",
)
)
panels.append(
timeseries_panel(
5,
"Astreae Per-Node Usage",
filesystem_usage_expr("/mnt/astreae", LONGHORN_NODE_REGEX),
{"h": 9, "w": 12, "x": 0, "y": 5},
unit="percent",
legend="{{node}}",
legend_display="table",
legend_placement="right",
time_from="30d",
)
)
panels.append(
timeseries_panel(
6,
"Asteria Per-Node Usage",
filesystem_usage_expr("/mnt/asteria", LONGHORN_NODE_REGEX),
{"h": 9, "w": 12, "x": 12, "y": 5},
unit="percent",
legend="{{node}}",
legend_display="table",
legend_placement="right",
time_from="30d",
)
)
panels.append(
timeseries_panel(
7,
"Astreae Usage History",
astreae_usage_expr("/mnt/astreae"),
{"h": 9, "w": 12, "x": 0, "y": 14},
unit="percent",
time_from="90d",
)
)
panels.append(
timeseries_panel(
8,
"Asteria Usage History",
astreae_usage_expr("/mnt/asteria"),
{"h": 9, "w": 12, "x": 12, "y": 14},
unit="percent",
time_from="90d",
)
)
return {
"uid": "atlas-storage",
"title": "Atlas Storage",
"folderUid": PRIVATE_FOLDER,
"editable": True,
"panels": panels,
"time": {"from": "now-12h", "to": "now"},
"annotations": {"list": []},
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "storage"],
}
def build_network_dashboard():
panels = []
panels.append(
stat_panel(
1,
"Ingress Success Rate (5m)",
TRAEFIK_SLI_5M,
{"h": 4, "w": 6, "x": 0, "y": 0},
unit="percentunit",
decimals=2,
thresholds={
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 0.995},
{"color": "yellow", "value": 0.999},
{"color": "green", "value": 0.9995},
],
},
)
)
panels.append(
stat_panel(
2,
"Error Budget Burn (1h)",
traefik_burn("1h"),
{"h": 4, "w": 6, "x": 6, "y": 0},
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 4},
],
},
decimals=2,
)
)
panels.append(
stat_panel(
3,
"Error Budget Burn (6h)",
traefik_burn("6h"),
{"h": 4, "w": 6, "x": 12, "y": 0},
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 2},
{"color": "red", "value": 4},
],
},
decimals=2,
)
)
panels.append(
stat_panel(
4,
"Edge P99 Latency (ms)",
TRAEFIK_P99_LATENCY_MS,
{"h": 4, "w": 6, "x": 18, "y": 0},
unit="ms",
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 200},
{"color": "orange", "value": 350},
{"color": "red", "value": 500},
],
},
decimals=1,
)
)
panels.append(
stat_panel(
5,
"Ingress Traffic",
NET_INGRESS_EXPR,
{"h": 4, "w": 8, "x": 0, "y": 4},
unit="Bps",
)
)
panels.append(
stat_panel(
6,
"Egress Traffic",
NET_EGRESS_EXPR,
{"h": 4, "w": 8, "x": 8, "y": 4},
unit="Bps",
)
)
panels.append(
stat_panel(
7,
"Intra-Cluster Traffic",
NET_INTERNAL_EXPR,
{"h": 4, "w": 8, "x": 16, "y": 4},
unit="Bps",
)
)
panels.append(
timeseries_panel(
8,
"Per-Node Throughput",
f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
{"h": 8, "w": 24, "x": 0, "y": 8},
unit="Bps",
legend="{{node}}",
legend_display="table",
legend_placement="right",
)
)
panels.append(
table_panel(
9,
"Top Namespaces",
'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
'+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
{"h": 9, "w": 12, "x": 0, "y": 16},
unit="Bps",
transformations=[{"id": "labelsToFields", "options": {}}],
)
)
panels.append(
table_panel(
10,
"Top Pods",
'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
'+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
{"h": 9, "w": 12, "x": 12, "y": 16},
unit="Bps",
transformations=[{"id": "labelsToFields", "options": {}}],
)
)
panels.append(
timeseries_panel(
11,
"Traefik Routers (req/s)",
f"topk(10, {TRAEFIK_ROUTER_EXPR})",
{"h": 9, "w": 12, "x": 0, "y": 25},
unit="req/s",
legend="{{router}}",
legend_display="table",
legend_placement="right",
)
)
panels.append(
timeseries_panel(
12,
"Traefik Entrypoints (req/s)",
'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
{"h": 9, "w": 12, "x": 12, "y": 25},
unit="req/s",
legend="{{entrypoint}}",
legend_display="table",
legend_placement="right",
)
)
return {
"uid": "atlas-network",
"title": "Atlas Network",
"folderUid": PRIVATE_FOLDER,
"editable": True,
"panels": panels,
"time": {"from": "now-12h", "to": "now"},
"annotations": {"list": []},
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "network"],
}
def build_gpu_dashboard():
panels = []
panels.append(
pie_panel(
1,
"Namespace GPU Share",
namespace_gpu_share_expr(),
{"h": 8, "w": 12, "x": 0, "y": 0},
)
)
panels.append(
timeseries_panel(
2,
"GPU Util by Namespace",
NAMESPACE_GPU_USAGE_INSTANT,
{"h": 8, "w": 12, "x": 12, "y": 0},
unit="percent",
legend="{{namespace}}",
legend_display="table",
legend_placement="right",
)
)
panels.append(
timeseries_panel(
3,
"GPU Util by Node",
'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
{"h": 8, "w": 12, "x": 0, "y": 8},
unit="percent",
legend="{{Hostname}}",
legend_display="table",
legend_placement="right",
)
)
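# Note: DCGM_FI_DEV_GPU_UTIL is reported per GPU (0-100), so sums by Hostname or pod can
# exceed 100 on multi-GPU nodes; read these panels as aggregate utilisation, not a bounded
# percentage.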
panels.append(
table_panel(
4,
"Top Pods by GPU Util",
'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
{"h": 8, "w": 12, "x": 12, "y": 8},
unit="percent",
transformations=[{"id": "labelsToFields", "options": {}}],
)
)
return {
"uid": "atlas-gpu",
"title": "Atlas GPU",
"folderUid": PRIVATE_FOLDER,
"editable": True,
"panels": panels,
"time": {"from": "now-12h", "to": "now"},
"annotations": {"list": []},
"schemaVersion": 39,
"style": "dark",
"tags": ["atlas", "gpu"],
}
DASHBOARDS = {
"atlas-overview": {
"builder": build_overview,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-overview.yaml",
},
"atlas-pods": {
"builder": build_pods_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-pods.yaml",
},
"atlas-nodes": {
"builder": build_nodes_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-nodes.yaml",
},
"atlas-storage": {
"builder": build_storage_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-storage.yaml",
},
"atlas-network": {
"builder": build_network_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml",
},
"atlas-gpu": {
"builder": build_gpu_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",
},
}
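# To add a dashboard: write a build_*() helper, register it here, then run the script with
# --build to regenerate the JSON; ConfigMaps are re-rendered from JSON on every invocation.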
def write_json(uid, data):
DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
path = DASHBOARD_DIR / f"{uid}.json"
path.write_text(json.dumps(data, indent=2) + "\n")
def render_configmap(uid, info):
json_path = DASHBOARD_DIR / f"{uid}.json"
payload = json.dumps(json.loads(json_path.read_text()), indent=2)
indented = "\n".join(" " + line for line in payload.splitlines())
output_path = info["configmap"]
content = CONFIG_TEMPLATE.format(
relative_path=output_path.relative_to(ROOT),
name=output_path.stem,
key=json_path.name,
payload=indented,
)
output_path.write_text(content)
print(f"Rendered {json_path.name} -> {output_path.relative_to(ROOT)}")
def main():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--build", action="store_true", help="Regenerate dashboard JSON files from builders")
args = parser.parse_args()
if args.build:
for uid, info in DASHBOARDS.items():
write_json(uid, info["builder"]())
for uid, info in DASHBOARDS.items():
render_configmap(uid, info)
if __name__ == "__main__":
main()