#!/usr/bin/env python3
"""Generate Atlas Grafana dashboards and render them into ConfigMaps.
Usage:
scripts/dashboards_render_atlas.py --build # rebuild JSON + ConfigMaps
scripts/dashboards_render_atlas.py # re-render ConfigMaps from JSON
"""
import argparse
import json
import textwrap
from pathlib import Path
# ---------------------------------------------------------------------------
# Paths, folders, and shared metadata
# ---------------------------------------------------------------------------
ROOT = Path(__file__).resolve().parents[1]  # repo root (scripts/ is one level down)
DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards"
# ConfigMap manifest wrapped around each dashboard JSON file.  Keys under
# metadata:/data: must be nested for the YAML to be a valid ConfigMap; the
# JSON payload is pre-indented by render_configmap() so it sits under the
# "<key>: |" block scalar.
CONFIG_TEMPLATE = textwrap.dedent(
    """\
    # {relative_path}
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: {name}
      labels:
        grafana_dashboard: "1"
    data:
      {key}: |
    {payload}
    """
)
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}  # Grafana datasource ref shared by all panels
PUBLIC_FOLDER = "overview"  # Grafana folder UID for the public overview dashboard
PRIVATE_FOLDER = "atlas-internal"  # Grafana folder UID for internal dashboards
# Shared green/yellow/red steps for percentage-valued panels.
PERCENT_THRESHOLDS = {
    "mode": "percentage",
    "steps": [
        {"color": "green", "value": None},
        {"color": "yellow", "value": 70},
        {"color": "red", "value": 85},
    ],
}
# ---------------------------------------------------------------------------
# Cluster metadata
# ---------------------------------------------------------------------------
CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
CONTROL_DEPENDENCIES = ["titan-db"]  # not a control-plane node, but graphed alongside
CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES
WORKER_NODES = [
    "titan-04",
    "titan-05",
    "titan-06",
    "titan-07",
    "titan-08",
    "titan-09",
    "titan-10",
    "titan-11",
    "titan-12",
    "titan-13",
    "titan-14",
    "titan-15",
    "titan-16",
    "titan-17",
    "titan-18",
    "titan-19",
    "titan-22",
    "titan-24",
]
# Alternation regexes and counts derived from the node lists above.
CONTROL_REGEX = "|".join(CONTROL_PLANE_NODES)
CONTROL_ALL_REGEX = "|".join(CONTROL_ALL)
WORKER_REGEX = "|".join(WORKER_NODES)
CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
WORKER_TOTAL = len(WORKER_NODES)
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"  # e.g. "/3", appended after ready counts
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
# Namespaces permitted to schedule pods onto control-plane nodes.
CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system"
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"  # nodes backing the shared storage mounts
GAUGE_WIDTHS = [5, 5, 5, 5, 4]  # grid widths for the overview's five top-row tiles
# Pods running on control-plane nodes outside the allowed namespaces
# (falls back to 0 when no series match).
CONTROL_WORKLOADS_EXPR = (
    f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
)
# ---------------------------------------------------------------------------
# PromQL helpers
# ---------------------------------------------------------------------------
# Maps node_exporter instance-level series to their Kubernetes node name by
# copying nodename -> node on node_uname_info.
NODE_INFO = 'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")'
def node_filter(regex):
    """Build a PromQL selector for node_uname_info series whose nodename
    matches *regex*, relabelled so the series carries a ``node`` label."""
    selector = f'node_uname_info{{nodename=~"{regex}"}}'
    return f'label_replace({selector}, "node", "$1", "nodename", "(.*)")'
def scoped_node_expr(base, scope=""):
    """Average *base* per node via the node_uname_info join; when *scope*
    is non-empty, restrict the result to nodes matching that regex."""
    joined = f"avg by (node) (({base}) * on(instance) group_left(node) {NODE_INFO})"
    if not scope:
        return joined
    return f"({joined}) * on(node) group_left() {node_filter(scope)}"
def node_cpu_expr(scope=""):
    """Per-node CPU busy percentage (100% minus idle), optionally scoped."""
    idle_rate = 'avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))'
    return scoped_node_expr(f"(1 - {idle_rate}) * 100", scope)
def node_mem_expr(scope=""):
    """Per-node memory usage percentage (Total minus Available), optionally scoped."""
    used_pct = (
        "avg by (instance) ("
        "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) "
        "/ node_memory_MemTotal_bytes * 100)"
    )
    return scoped_node_expr(used_pct, scope)
def filesystem_usage_expr(mount, scope=""):
    """Per-node used percentage of the filesystem mounted at *mount*
    (tmpfs/overlay excluded), optionally scoped to a node regex."""
    sel = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
    pct_used = (
        f"avg by (instance) ((1 - (node_filesystem_avail_bytes{sel} "
        f"/ node_filesystem_size_bytes{sel})) * 100)"
    )
    return scoped_node_expr(pct_used, scope)
def root_usage_expr(scope=""):
    """Used percentage of each node's root ("/") filesystem."""
    return filesystem_usage_expr("/", scope)
def astreae_usage_expr(mount):
    """Cluster-wide used percentage of the shared filesystem at *mount*
    (summed across all nodes that export it)."""
    sel = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
    avail = f"sum(node_filesystem_avail_bytes{sel})"
    size = f"sum(node_filesystem_size_bytes{sel})"
    return f"100 - ({avail} / {size} * 100)"
def astreae_free_expr(mount):
    """Total free bytes across the shared filesystem at *mount*."""
    selector = f'mountpoint="{mount}",fstype!~"tmpfs|overlay"'
    return f"sum(node_filesystem_avail_bytes{{{selector}}})"
def topk_with_node(expr):
    """Keep only the single hottest series of *expr*, copying its node label
    into __name__ so stat panels display the node name."""
    top_one = f"topk(1, {expr})"
    return f'label_replace({top_one}, "__name__", "$1", "node", "(.*)")'
def node_net_expr(scope=""):
    """Per-node network throughput (rx + tx over non-loopback devices)."""
    throughput = (
        'sum by (instance) ('
        'rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
        '+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))'
    )
    return scoped_node_expr(throughput, scope)
def node_io_expr(scope=""):
    """Per-node disk throughput (reads + writes)."""
    disk_rate = (
        "sum by (instance) (rate(node_disk_read_bytes_total[5m]) "
        "+ rate(node_disk_written_bytes_total[5m]))"
    )
    return scoped_node_expr(disk_rate, scope)
def namespace_share_expr(resource_expr):
    """Percentage share of *resource_expr* per namespace, limited to the
    namespaces selected by NAMESPACE_TOP_FILTER; the denominator is clamped
    to at least 1 to avoid division by zero."""
    filtered = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )"
    denominator = f"clamp_min(sum( {filtered} ), 1)"
    return f"100 * ( {filtered} ) / {denominator}"


def namespace_cpu_share_expr():
    """Per-namespace share of cluster CPU usage."""
    return namespace_share_expr(NAMESPACE_CPU_RAW)


def namespace_ram_share_expr():
    """Per-namespace share of cluster RAM (working set)."""
    return namespace_share_expr(NAMESPACE_RAM_RAW)


def namespace_gpu_share_expr():
    """Per-namespace share of GPU utilisation."""
    return namespace_share_expr(NAMESPACE_GPU_RAW)
# Count of pods in any phase other than Running/Succeeded.
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
# Count of containers waiting in CrashLoopBackOff or ImagePullBackOff.
CRASHLOOP_EXPR = (
    'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
    '{reason=~"CrashLoopBackOff|ImagePullBackOff"}))'
)
# Count of pods whose deletion timestamp is more than 10 minutes old.
STUCK_TERMINATING_EXPR = (
    'sum(max by (namespace,pod) ('
    '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)'
    ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
    '))'
)
# Table variants: value is pod age in seconds, joined with node and
# phase/reason labels for display.
PROBLEM_TABLE_EXPR = (
    "(time() - kube_pod_created{pod!=\"\"}) "
    "* on(namespace,pod) group_left(node) kube_pod_info "
    "* on(namespace,pod) group_left(phase) "
    "max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})"
)
CRASHLOOP_TABLE_EXPR = (
    "(time() - kube_pod_created{pod!=\"\"}) "
    "* on(namespace,pod) group_left(node) kube_pod_info "
    "* on(namespace,pod,container) group_left(reason) "
    "max by (namespace,pod,container,reason) "
    "(kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})"
)
STUCK_TABLE_EXPR = (
    "("
    "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) "
    "and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) "
    "* on(namespace,pod) group_left(node) kube_pod_info"
    ")"
)
# Raw per-namespace resource usage used by the share expressions above.
NAMESPACE_CPU_RAW = (
    'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
)
NAMESPACE_RAM_RAW = (
    'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
)
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES)
# GPUs requested (or limited) per namespace.
NAMESPACE_GPU_ALLOC = (
    'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
    ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
)
# DCGM GPU utilisation per namespace: 1h average (for shares) and instant.
NAMESPACE_GPU_USAGE_SHARE = (
    'sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))'
)
NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
# GPU usage with a zero fallback per namespace, so namespaces without GPUs
# still appear in the share computations.
NAMESPACE_GPU_RAW = (
    "("
    + NAMESPACE_GPU_USAGE_SHARE
    + ") or on(namespace) ("
    + NAMESPACE_CPU_RAW
    + " * 0)"
)
NAMESPACE_GPU_WEIGHT = (
    "("
    + NAMESPACE_GPU_ALLOC
    + ") or on(namespace) ("
    + NAMESPACE_CPU_RAW
    + " * 0)"
)
# Combined activity score (CPU cores + RAM GB + 100x GPU count) used to pick
# the "top" namespaces for the share pie charts.
NAMESPACE_ACTIVITY_SCORE = (
    "( "
    + NAMESPACE_CPU_RAW
    + " ) + ("
    + NAMESPACE_RAM_RAW
    + " / 1e9) + ("
    + NAMESPACE_GPU_WEIGHT
    + " * 100)"
)
NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
# Traefik pod network rates ("or vector(0)" keeps panels non-empty when idle).
TRAEFIK_NET_INGRESS = (
    'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
    " or on() vector(0)"
)
TRAEFIK_NET_EGRESS = (
    'sum(rate(container_network_transmit_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
    " or on() vector(0)"
)
NET_CLUSTER_RX = (
    'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
    " or on() vector(0)"
)
NET_CLUSTER_TX = (
    'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
    " or on() vector(0)"
)
# Physical NICs only: exclude loopback, CNI/veth/bridge/vxlan/wireguard devices.
PHYSICAL_NET_FILTER = 'device!~"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*"'
NET_NODE_RX_PHYS = (
    f'sum(rate(node_network_receive_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)'
)
NET_NODE_TX_PHYS = (
    f'sum(rate(node_network_transmit_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)'
)
NET_TOTAL_EXPR = NET_NODE_TX_PHYS
NET_INGRESS_EXPR = NET_NODE_RX_PHYS
NET_EGRESS_EXPR = NET_NODE_TX_PHYS
# Pod-to-pod traffic (everything except the traefik namespace).
NET_INTERNAL_EXPR = (
    'sum(rate(container_network_receive_bytes_total{namespace!="traefik",pod!=""}[5m]) '
    '+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))'
    ' or on() vector(0)'
)
# ---------------------------------------------------------------------------
# Panel factories
# ---------------------------------------------------------------------------
def stat_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    thresholds=None,
    text_mode="value",
    legend=None,
    instant=False,
    value_suffix=None,
    links=None,
):
    """Return a Grafana stat panel definition.

    The default thresholds step from grey to green at 1; pass *thresholds*
    to override. *legend*, *instant*, *value_suffix* and *links* add their
    keys only when supplied.
    """
    target = {"expr": expr, "refId": "A"}
    if legend:
        target["legendFormat"] = legend
    if instant:
        target["instant"] = True
    custom = {"displayMode": "auto"}
    if value_suffix:
        custom["valueSuffix"] = value_suffix
    default_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "rgba(115, 115, 115, 1)", "value": None},
            {"color": "green", "value": 1},
        ],
    }
    panel = {
        "id": panel_id,
        "type": "stat",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [target],
        "fieldConfig": {
            "defaults": {
                "color": {"mode": "palette-classic"},
                "mappings": [],
                "thresholds": thresholds or default_thresholds,
                "unit": unit,
                "custom": custom,
            },
            "overrides": [],
        },
        "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "textMode": text_mode,
        },
    }
    if links:
        panel["links"] = links
    return panel
def gauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    min_value=0,
    max_value=1,
    thresholds=None,
    links=None,
):
    """Return a Grafana gauge panel definition.

    Without explicit *thresholds* the gauge is green until *max_value*,
    where it turns red.
    """
    fallback_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "red", "value": max_value},
        ],
    }
    panel = {
        "id": panel_id,
        "type": "gauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {
            "defaults": {
                "min": min_value,
                "max": max_value,
                "thresholds": thresholds or fallback_thresholds,
            },
            "overrides": [],
        },
        "options": {
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "orientation": "auto",
            "showThresholdMarkers": False,
            "showThresholdLabels": False,
        },
    }
    if links:
        panel["links"] = links
    return panel
def timeseries_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    legend=None,
    legend_display="table",
    legend_placement="bottom",
    legend_calcs=None,
    time_from=None,
    links=None,
):
    """Return a Grafana time-series panel definition.

    *legend_calcs*, *time_from* and *links* add their keys only when given.
    """
    target = {"expr": expr, "refId": "A"}
    if legend:
        target["legendFormat"] = legend
    legend_options = {
        "displayMode": legend_display,
        "placement": legend_placement,
    }
    if legend_calcs:
        legend_options["calcs"] = legend_calcs
    panel = {
        "id": panel_id,
        "type": "timeseries",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [target],
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
        "options": {
            "legend": legend_options,
            "tooltip": {"mode": "multi"},
        },
    }
    if time_from:
        panel["timeFrom"] = time_from
    if links:
        panel["links"] = links
    return panel
def table_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    transformations=None,
):
    """Return a Grafana table panel definition; *transformations* are
    attached only when a non-empty list is supplied."""
    definition = {
        "id": panel_id,
        "type": "table",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
        "options": {"showHeader": True},
    }
    if transformations:
        definition["transformations"] = transformations
    return definition
def pie_panel(panel_id, title, expr, grid):
    """Return a pie chart panel whose slices are labelled by namespace."""
    query = {"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}
    return {
        "id": panel_id,
        "type": "piechart",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {
            "defaults": {
                "unit": "percent",
                "color": {"mode": "palette-classic"},
            },
            "overrides": [],
        },
        "options": {
            "legend": {"displayMode": "list", "placement": "right"},
            "pieType": "pie",
            "displayLabels": ["percent"],
            "tooltip": {"mode": "single"},
            "colorScheme": "interpolateSpectral",
            "colorBy": "value",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
        },
    }
def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None):
    """Return a bar gauge panel keyed by node label; percent units get a
    fixed 0-100 range with green/yellow/orange/red steps."""
    is_percent = unit == "percent"
    panel = {
        "id": panel_id,
        "type": "bargauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{node}}"}],
        "fieldConfig": {
            "defaults": {
                "unit": unit,
                "min": 0,
                "max": 100 if is_percent else None,
                "thresholds": {
                    "mode": "absolute",
                    "steps": [
                        {"color": "green", "value": None},
                        {"color": "yellow", "value": 50},
                        {"color": "orange", "value": 70},
                        {"color": "red", "value": 85},
                    ],
                },
            },
            "overrides": [],
        },
        "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": {
                "calcs": ["lastNotNull"],
                "fields": "",
                "values": False,
            },
        },
    }
    if links:
        panel["links"] = links
    return panel
def text_panel(panel_id, title, content, grid):
    """Return a markdown text panel; text panels carry no datasource."""
    panel = {"id": panel_id, "type": "text", "title": title, "gridPos": grid}
    panel["datasource"] = None
    panel["options"] = {"mode": "markdown", "content": content}
    return panel
def link_to(uid):
    """Return a one-element panel-links list opening dashboard *uid* in a new tab."""
    link = {"title": f"Open {uid} dashboard", "url": f"/d/{uid}", "targetBlank": True}
    return [link]
# ---------------------------------------------------------------------------
# Dashboard builders
# ---------------------------------------------------------------------------
def build_overview():
    """Build the public Atlas Overview dashboard (uid "atlas-overview").

    Layout: top row of readiness/problem tiles, hottest-node stats, shared
    storage stats, namespace share pies, worker/control-plane CPU+RAM time
    series, network throughput, and root-disk usage.
    """
    panels = []
    # Top-row tiles: (panel_id, title, expr, value suffix, "ok" value, links).
    # NOTE(review): the suffix element is not consumed by the loop below.
    row1_stats = [
        (
            1,
            "Workers Ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
            WORKER_SUFFIX,
            WORKER_TOTAL,
            None,
        ),
        (
            2,
            "Control Plane Ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
            CONTROL_SUFFIX,
            CONTROL_TOTAL,
            None,
        ),
        (
            3,
            "Control Plane Workloads",
            CONTROL_WORKLOADS_EXPR,
            None,
            4,
            link_to("atlas-pods"),
        ),
        (
            4,
            "Problem Pods",
            PROBLEM_PODS_EXPR,
            None,
            1,
            link_to("atlas-pods"),
        ),
        (
            5,
            "Stuck Terminating",
            STUCK_TERMINATING_EXPR,
            None,
            1,
            link_to("atlas-pods"),
        ),
    ]

    def gauge_grid(idx):
        # Width and x-offset of the idx-th top-row tile, from GAUGE_WIDTHS.
        width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
        x = sum(GAUGE_WIDTHS[:idx])
        return width, x

    for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
        thresholds = None
        min_value = 0
        max_value = ok_value or 5
        if panel_id == 1:
            # Workers: shade from red to green as the ready count approaches
            # the full fleet size.
            max_value = WORKER_TOTAL
            thresholds = {
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "orange", "value": WORKER_TOTAL - 2},
                    {"color": "yellow", "value": WORKER_TOTAL - 1},
                    {"color": "green", "value": WORKER_TOTAL},
                ],
            }
        elif panel_id == 2:
            # Control plane: anything short of all nodes ready is red.
            max_value = CONTROL_TOTAL
            thresholds = {
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "green", "value": CONTROL_TOTAL},
                ],
            }
        elif panel_id in (3, 4, 5):
            # Problem counters: escalate colour as the count grows.
            max_value = 4
            thresholds = {
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 1},
                    {"color": "orange", "value": 2},
                    {"color": "red", "value": 3},
                ],
            }
        else:
            thresholds = {
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": max_value},
                ],
            }
        width, x = gauge_grid(idx)
        # Problem counters render as stat tiles; readiness counts as gauges.
        if panel_id in (3, 4, 5):
            panels.append(
                stat_panel(
                    panel_id,
                    title,
                    expr,
                    {"h": 5, "w": width, "x": x, "y": 0},
                    thresholds=thresholds,
                    legend=None,
                    links=links,
                    text_mode="value",
                )
            )
        else:
            panels.append(
                gauge_panel(
                    panel_id,
                    title,
                    expr,
                    {"h": 5, "w": width, "x": x, "y": 0},
                    min_value=min_value,
                    max_value=max_value,
                    thresholds=thresholds,
                    links=links,
                )
            )
    # Row 2: single hottest node per resource dimension.
    hottest = [
        (7, "Hottest node: CPU", topk_with_node(node_cpu_expr()), "percent"),
        (8, "Hottest node: RAM", topk_with_node(node_mem_expr()), "percent"),
        (9, "Hottest node: NET (rx+tx)", topk_with_node(node_net_expr()), "Bps"),
        (10, "Hottest node: I/O (r+w)", topk_with_node(node_io_expr()), "Bps"),
    ]
    for idx, (panel_id, title, expr, unit) in enumerate(hottest):
        panels.append(
            stat_panel(
                panel_id,
                title,
                f"{expr}",
                {"h": 3, "w": 6, "x": 6 * idx, "y": 5},
                unit=unit,
                thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                text_mode="name_and_value",
                legend="{{node}}",
                instant=True,
                links=link_to("atlas-nodes"),
            )
        )
    # Row 3: shared filesystem usage/free tiles.
    storage_panels = [
        (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
        (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"),
        (25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"),
        (26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"),
    ]
    for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
        panels.append(
            stat_panel(
                panel_id,
                title,
                expr,
                {"h": 6, "w": 6, "x": 6 * idx, "y": 10},
                unit=unit,
                thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                links=link_to("atlas-storage"),
            )
        )
    # Namespace share pie charts (CPU / GPU / RAM).
    panels.append(
        pie_panel(
            11,
            "Namespace CPU Share",
            namespace_cpu_share_expr(),
            {"h": 9, "w": 8, "x": 0, "y": 16},
        )
    )
    panels.append(
        pie_panel(
            12,
            "Namespace GPU Share",
            namespace_gpu_share_expr(),
            {"h": 9, "w": 8, "x": 8, "y": 16},
        )
    )
    panels.append(
        pie_panel(
            13,
            "Namespace RAM Share",
            namespace_ram_share_expr(),
            {"h": 9, "w": 8, "x": 16, "y": 16},
        )
    )
    worker_filter = f"{WORKER_REGEX}"
    # Per-node CPU/RAM time series for workers and control plane.
    panels.append(
        timeseries_panel(
            14,
            "Worker Node CPU",
            node_cpu_expr(worker_filter),
            {"h": 8, "w": 12, "x": 0, "y": 32},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            links=link_to("atlas-nodes"),
        )
    )
    panels.append(
        timeseries_panel(
            15,
            "Worker Node RAM",
            node_mem_expr(worker_filter),
            {"h": 8, "w": 12, "x": 12, "y": 32},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            links=link_to("atlas-nodes"),
        )
    )
    panels.append(
        timeseries_panel(
            16,
            "Control plane CPU",
            node_cpu_expr(CONTROL_REGEX),
            {"h": 7, "w": 12, "x": 0, "y": 40},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            17,
            "Control plane RAM",
            node_mem_expr(CONTROL_REGEX),
            {"h": 7, "w": 12, "x": 12, "y": 40},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    # Network throughput row (ingress / egress / intra-cluster).
    panels.append(
        timeseries_panel(
            18,
            "Cluster Ingress Throughput",
            NET_INGRESS_EXPR,
            {"h": 7, "w": 8, "x": 0, "y": 25},
            unit="Bps",
            legend="Ingress (Traefik)",
            legend_display="list",
            legend_placement="bottom",
            links=link_to("atlas-network"),
        )
    )
    panels.append(
        timeseries_panel(
            19,
            "Cluster Egress Throughput",
            NET_EGRESS_EXPR,
            {"h": 7, "w": 8, "x": 8, "y": 25},
            unit="Bps",
            legend="Egress (Traefik)",
            legend_display="list",
            legend_placement="bottom",
            links=link_to("atlas-network"),
        )
    )
    panels.append(
        timeseries_panel(
            20,
            "Intra-Cluster Throughput",
            NET_INTERNAL_EXPR,
            {"h": 7, "w": 8, "x": 16, "y": 25},
            unit="Bps",
            legend="Internal traffic",
            legend_display="list",
            legend_placement="bottom",
            links=link_to("atlas-network"),
        )
    )
    # Root disk usage: 30-day trend plus top-8 fullest nodes.
    panels.append(
        timeseries_panel(
            21,
            "Root Filesystem Usage",
            root_usage_expr(),
            {"h": 8, "w": 12, "x": 0, "y": 47},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            time_from="30d",
            links=link_to("atlas-storage"),
        )
    )
    panels.append(
        bargauge_panel(
            22,
            "Nodes Closest to Full Root Disks",
            f"topk(8, {root_usage_expr()})",
            {"h": 8, "w": 12, "x": 12, "y": 47},
            unit="percent",
            links=link_to("atlas-storage"),
        )
    )
    return {
        "uid": "atlas-overview",
        "title": "Atlas Overview",
        "folderUid": PUBLIC_FOLDER,
        "editable": False,
        "annotations": {"list": []},
        "panels": panels,
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "overview"],
        "templating": {"list": []},
        "time": {"from": "now-1h", "to": "now"},
        "refresh": "1m",
        "links": [
            {"title": "Atlas Pods", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
            {"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
            {"title": "Atlas Storage", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False},
            {"title": "Atlas Network", "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": False},
            {"title": "Atlas GPU", "type": "dashboard", "dashboardUid": "atlas-gpu", "keepTime": False},
        ],
    }
def build_pods_dashboard():
    """Build the internal Atlas Pods dashboard: stat tiles for problem /
    crash-looping / stuck-terminating pods, plus detail tables for each."""
    panels = []
    panels.append(
        stat_panel(
            1,
            "Problem Pods",
            PROBLEM_PODS_EXPR,
            {"h": 4, "w": 6, "x": 0, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            2,
            "CrashLoop / ImagePull",
            CRASHLOOP_EXPR,
            {"h": 4, "w": 6, "x": 6, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            3,
            "Stuck Terminating (>10m)",
            STUCK_TERMINATING_EXPR,
            {"h": 4, "w": 6, "x": 12, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            4,
            "Control Plane Workloads",
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
            {"h": 4, "w": 6, "x": 18, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )
    # Detail tables: value is pod age in seconds; labels become columns.
    panels.append(
        table_panel(
            5,
            "Pods Not Running",
            PROBLEM_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 4},
            unit="s",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            6,
            "CrashLoop / ImagePull",
            CRASHLOOP_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 14},
            unit="s",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            7,
            "Terminating >10m",
            STUCK_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 24},
            unit="s",
            # The query returns all terminating pods; filter to >10 minutes here.
            transformations=[
                {"id": "labelsToFields", "options": {}},
                {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
            ],
        )
    )
    return {
        "uid": "atlas-pods",
        "title": "Atlas Pods",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "pods"],
    }
def build_nodes_dashboard():
    """Build the internal Atlas Nodes dashboard: readiness tiles plus
    per-node CPU/RAM and root-disk time series."""
    panels = []
    panels.append(
        stat_panel(
            1,
            "Worker Nodes Ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
            {"h": 4, "w": 8, "x": 0, "y": 0},
            value_suffix=WORKER_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            2,
            "Control Plane Ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
            {"h": 4, "w": 8, "x": 8, "y": 0},
            value_suffix=CONTROL_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            3,
            "Control Plane Workloads",
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
            {"h": 4, "w": 8, "x": 16, "y": 0},
        )
    )
    # All-node CPU/RAM series (no scope regex).
    panels.append(
        timeseries_panel(
            4,
            "Node CPU",
            node_cpu_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 4},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            5,
            "Node RAM",
            node_mem_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 13},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    # Control plane including the titan-db dependency node.
    panels.append(
        timeseries_panel(
            6,
            "Control Plane (incl. titan-db) CPU",
            node_cpu_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 0, "y": 22},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            7,
            "Control Plane (incl. titan-db) RAM",
            node_mem_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 12, "y": 22},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            8,
            "Root Filesystem Usage",
            root_usage_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 31},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    return {
        "uid": "atlas-nodes",
        "title": "Atlas Nodes",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "nodes"],
    }
def build_storage_dashboard():
    """Build the internal Atlas Storage dashboard: shared-filesystem usage
    tiles, per-node breakdowns, and 90-day usage history."""
    panels = []
    panels.append(
        stat_panel(
            1,
            "Astreae Usage",
            astreae_usage_expr("/mnt/astreae"),
            {"h": 5, "w": 6, "x": 0, "y": 0},
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
        )
    )
    panels.append(
        stat_panel(
            2,
            "Asteria Usage",
            astreae_usage_expr("/mnt/asteria"),
            {"h": 5, "w": 6, "x": 6, "y": 0},
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
        )
    )
    panels.append(
        stat_panel(
            3,
            "Astreae Free",
            astreae_free_expr("/mnt/astreae"),
            {"h": 5, "w": 6, "x": 12, "y": 0},
            unit="decbytes",
        )
    )
    panels.append(
        stat_panel(
            4,
            "Asteria Free",
            astreae_free_expr("/mnt/asteria"),
            {"h": 5, "w": 6, "x": 18, "y": 0},
            unit="decbytes",
        )
    )
    # Per-node breakdown, scoped to the storage-backing nodes.
    panels.append(
        timeseries_panel(
            5,
            "Astreae Per-Node Usage",
            filesystem_usage_expr("/mnt/astreae", LONGHORN_NODE_REGEX),
            {"h": 9, "w": 12, "x": 0, "y": 5},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    panels.append(
        timeseries_panel(
            6,
            "Asteria Per-Node Usage",
            filesystem_usage_expr("/mnt/asteria", LONGHORN_NODE_REGEX),
            {"h": 9, "w": 12, "x": 12, "y": 5},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    # Long-range (90d) aggregate usage history.
    panels.append(
        timeseries_panel(
            7,
            "Astreae Usage History",
            astreae_usage_expr("/mnt/astreae"),
            {"h": 9, "w": 12, "x": 0, "y": 14},
            unit="percent",
            time_from="90d",
        )
    )
    panels.append(
        timeseries_panel(
            8,
            "Asteria Usage History",
            astreae_usage_expr("/mnt/asteria"),
            {"h": 9, "w": 12, "x": 12, "y": 14},
            unit="percent",
            time_from="90d",
        )
    )
    return {
        "uid": "atlas-storage",
        "title": "Atlas Storage",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "storage"],
    }
def build_network_dashboard():
    """Build the internal Atlas Network dashboard: traffic tiles, per-node
    throughput, top talkers, and Traefik router/entrypoint rates."""
    panels = []
    panels.append(
        stat_panel(
            1,
            "Ingress Traffic",
            NET_INGRESS_EXPR,
            {"h": 4, "w": 8, "x": 0, "y": 0},
            unit="Bps",
        )
    )
    panels.append(
        stat_panel(
            2,
            "Egress Traffic",
            NET_EGRESS_EXPR,
            {"h": 4, "w": 8, "x": 8, "y": 0},
            unit="Bps",
        )
    )
    panels.append(
        stat_panel(
            3,
            "Intra-Cluster Traffic",
            NET_INTERNAL_EXPR,
            {"h": 4, "w": 8, "x": 16, "y": 0},
            unit="Bps",
        )
    )
    panels.append(
        stat_panel(
            4,
            "Top Router req/s",
            f"topk(1, {TRAEFIK_ROUTER_EXPR})",
            {"h": 4, "w": 8, "x": 0, "y": 4},
            unit="req/s",
            legend="{{router}}",
        )
    )
    # Physical NIC throughput (rx+tx) joined to node names.
    panels.append(
        timeseries_panel(
            5,
            "Per-Node Throughput",
            f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
            {"h": 8, "w": 24, "x": 0, "y": 8},
            unit="Bps",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    # Top talkers by container network rate.
    panels.append(
        table_panel(
            6,
            "Top Namespaces",
            'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
            '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
            {"h": 9, "w": 12, "x": 0, "y": 16},
            unit="Bps",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            7,
            "Top Pods",
            'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
            '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
            {"h": 9, "w": 12, "x": 12, "y": 16},
            unit="Bps",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        timeseries_panel(
            8,
            "Traefik Routers (req/s)",
            f"topk(10, {TRAEFIK_ROUTER_EXPR})",
            {"h": 9, "w": 12, "x": 0, "y": 25},
            unit="req/s",
            legend="{{router}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            9,
            "Traefik Entrypoints (req/s)",
            'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
            {"h": 9, "w": 12, "x": 12, "y": 25},
            unit="req/s",
            legend="{{entrypoint}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    return {
        "uid": "atlas-network",
        "title": "Atlas Network",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "network"],
    }
def build_gpu_dashboard():
    """Build the internal Atlas GPU dashboard from DCGM utilisation metrics:
    namespace share pie, per-namespace and per-node series, top pods table."""
    panels = [
        pie_panel(
            1,
            "Namespace GPU Share",
            namespace_gpu_share_expr(),
            {"h": 8, "w": 12, "x": 0, "y": 0},
        ),
        timeseries_panel(
            2,
            "GPU Util by Namespace",
            NAMESPACE_GPU_USAGE_INSTANT,
            {"h": 8, "w": 12, "x": 12, "y": 0},
            unit="percent",
            legend="{{namespace}}",
            legend_display="table",
            legend_placement="right",
        ),
        timeseries_panel(
            3,
            "GPU Util by Node",
            'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
            {"h": 8, "w": 12, "x": 0, "y": 8},
            unit="percent",
            legend="{{Hostname}}",
            legend_display="table",
            legend_placement="right",
        ),
        table_panel(
            4,
            "Top Pods by GPU Util",
            'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
            {"h": 8, "w": 12, "x": 12, "y": 8},
            unit="percent",
            transformations=[{"id": "labelsToFields", "options": {}}],
        ),
    ]
    return {
        "uid": "atlas-gpu",
        "title": "Atlas GPU",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "gpu"],
    }
# Registry mapping dashboard uid -> builder function and ConfigMap output path.
DASHBOARDS = {
    "atlas-overview": {
        "builder": build_overview,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-overview.yaml",
    },
    "atlas-pods": {
        "builder": build_pods_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-pods.yaml",
    },
    "atlas-nodes": {
        "builder": build_nodes_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-nodes.yaml",
    },
    "atlas-storage": {
        "builder": build_storage_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-storage.yaml",
    },
    "atlas-network": {
        "builder": build_network_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml",
    },
    "atlas-gpu": {
        "builder": build_gpu_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",
    },
}
def write_json(uid, data):
    """Serialise *data* to DASHBOARD_DIR/<uid>.json, creating the directory."""
    DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
    target = DASHBOARD_DIR / f"{uid}.json"
    target.write_text(json.dumps(data, indent=2) + "\n")
def render_configmap(uid, info):
    """Render DASHBOARD_DIR/<uid>.json into the ConfigMap at info["configmap"].

    The JSON is parsed and re-dumped (normalising formatting), then every
    line is indented so it nests under the "<file>.json: |" block scalar of
    CONFIG_TEMPLATE.
    """
    json_path = DASHBOARD_DIR / f"{uid}.json"
    payload = json.dumps(json.loads(json_path.read_text()), indent=2)
    # Four spaces, not one: the data key sits under "data:" with its own
    # indent, and a YAML block scalar's body must be indented deeper than
    # its key for the manifest to parse.
    indented = "\n".join("    " + line for line in payload.splitlines())
    output_path = info["configmap"]
    content = CONFIG_TEMPLATE.format(
        relative_path=output_path.relative_to(ROOT),
        name=output_path.stem,
        key=json_path.name,
        payload=indented,
    )
    output_path.write_text(content)
    print(f"Rendered {json_path.name} -> {output_path.relative_to(ROOT)}")
def main():
    """CLI entry point: optionally rebuild dashboard JSON, then always
    re-render every ConfigMap from the JSON on disk."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--build",
        action="store_true",
        help="Regenerate dashboard JSON files from builders",
    )
    args = parser.parse_args()
    if args.build:
        for uid, info in DASHBOARDS.items():
            write_json(uid, info["builder"]())
    for uid, info in DASHBOARDS.items():
        render_configmap(uid, info)


if __name__ == "__main__":
    main()