#!/usr/bin/env python3
"""Generate Atlas Grafana dashboards and render them into ConfigMaps.

Usage:
    scripts/render_dashboards.py --build   # rebuild JSON + ConfigMaps
    scripts/render_dashboards.py           # re-render ConfigMaps from JSON
"""

import argparse
import json
import textwrap
from pathlib import Path

# ---------------------------------------------------------------------------
# Paths, folders, and shared metadata
# ---------------------------------------------------------------------------

# This script is expected to live one directory below the repository root.
ROOT = Path(__file__).resolve().parents[1]
DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards"

# Indentation applied to every line of the dashboard JSON when it is embedded
# in the ConfigMap.  It must be deeper than the two-space indent of the
# ``{key}: |`` line in CONFIG_TEMPLATE, otherwise the YAML block scalar is
# not valid.
PAYLOAD_INDENT = "    "

CONFIG_TEMPLATE = textwrap.dedent(
    """# {relative_path}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {name}
  labels:
    grafana_dashboard: "1"
data:
  {key}: |
{payload}
"""
)

PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
PUBLIC_FOLDER = "atlas-overview"
PRIVATE_FOLDER = "atlas-internal"

PERCENT_THRESHOLDS = {
    "mode": "percentage",
    "steps": [
        {"color": "green", "value": None},
        {"color": "yellow", "value": 70},
        {"color": "red", "value": 85},
    ],
}

# ---------------------------------------------------------------------------
# Cluster metadata
# ---------------------------------------------------------------------------

CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
CONTROL_DEPENDENCIES = ["titan-db"]
CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES
WORKER_NODES = [
    "titan-04",
    "titan-05",
    "titan-06",
    "titan-07",
    "titan-08",
    "titan-09",
    "titan-10",
    "titan-11",
    "titan-12",
    "titan-13",
    "titan-14",
    "titan-15",
    "titan-16",
    "titan-17",
    "titan-18",
    "titan-19",
    "titan-22",
    "titan-24",
]

# Regex alternations used inside PromQL label matchers.
CONTROL_REGEX = "|".join(CONTROL_PLANE_NODES)
CONTROL_ALL_REGEX = "|".join(CONTROL_ALL)
WORKER_REGEX = "|".join(WORKER_NODES)

CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
WORKER_TOTAL = len(WORKER_NODES)
# Rendered after the "ready" counts, e.g. "3" + "/3".
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"
WORKER_SUFFIX = f"/{WORKER_TOTAL}"

# Namespaces whose pods are excluded from the "Control plane workloads" count.
CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring"
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"

# ---------------------------------------------------------------------------
# PromQL helpers
# ---------------------------------------------------------------------------

# node_uname_info with its ``nodename`` label copied into a ``node`` label.
NODE_INFO = 'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")'


def node_filter(regex):
    """Return a selector that evaluates to 1 for nodes matching the regex."""
    return (
        f'label_replace(node_uname_info{{nodename=~"{regex}"}}, '
        '"node", "$1", "nodename", "(.*)")'
    )


def scoped_node_expr(base, scope=""):
    """Attach nodename metadata and optionally filter to a scope regex.

    ``base`` is a PromQL expression keyed by ``instance``.  The result is
    averaged by ``node``; when ``scope`` is non-empty it is additionally
    multiplied by ``node_filter(scope)`` so only matching nodes remain.
    """
    expr = f"avg by (node) (({base}) * on(instance) group_left(node) {NODE_INFO})"
    if scope:
        expr = f"({expr}) * on(node) group_left() {node_filter(scope)}"
    return expr


def node_cpu_expr(scope=""):
    """Return per-node CPU usage in percent (100 minus the idle rate)."""
    idle = 'avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))'
    base = f"(1 - {idle}) * 100"
    return scoped_node_expr(base, scope)


def node_mem_expr(scope=""):
    """Return per-node memory usage in percent (total minus available)."""
    usage = (
        "avg by (instance) ("
        "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) "
        "/ node_memory_MemTotal_bytes * 100)"
    )
    return scoped_node_expr(usage, scope)


def filesystem_usage_expr(mount, scope=""):
    """Return per-node usage in percent of the filesystem mounted at ``mount``."""
    base = (
        f'avg by (instance) ('
        f'(1 - (node_filesystem_avail_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}} '
        f'/ node_filesystem_size_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}})) * 100)'
    )
    return scoped_node_expr(base, scope)


def root_usage_expr(scope=""):
    """Return per-node usage in percent of the root filesystem."""
    return filesystem_usage_expr("/", scope)


def astreae_usage_expr(mount):
    """Return aggregate usage in percent of ``mount`` summed over all series."""
    return (
        f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / "
        f"sum(node_filesystem_size_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) * 100)"
    )


def astreae_free_expr(mount):
    """Return the total available bytes of ``mount`` summed over all series."""
    return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})"


def topk_with_node(expr):
    """Keep only the top series of ``expr`` and copy ``node`` into ``__name__``."""
    return f'label_replace(topk(1, {expr}), "__name__", "$1", "node", "(.*)")'


def node_net_expr(scope=""):
    """Return per-node network throughput (receive + transmit) in bytes/s."""
    base = (
        'sum by (instance) ('
        'rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]) '
        '+ rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))'
    )
    return scoped_node_expr(base, scope)


def node_io_expr(scope=""):
    """Return per-node disk throughput (read + written) in bytes/s."""
    base = (
        "sum by (instance) (rate(node_disk_read_bytes_total[5m]) "
        "+ rate(node_disk_written_bytes_total[5m]))"
    )
    return scoped_node_expr(base, scope)


PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
CRASHLOOP_EXPR = (
    'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
    '{reason=~"CrashLoopBackOff|ImagePullBackOff"}))'
)
# Pods whose deletion timestamp is more than 600 seconds in the past.
STUCK_TERMINATING_EXPR = (
    'sum(max by (namespace,pod) ('
    '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)'
    ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
    '))'
)

PROBLEM_TABLE_EXPR = (
    "(time() - kube_pod_created{pod!=\"\"}) "
    "* on(namespace,pod) group_left(node) kube_pod_info "
    "* on(namespace,pod) group_left(phase) "
    "max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})"
)
CRASHLOOP_TABLE_EXPR = (
    "(time() - kube_pod_created{pod!=\"\"}) "
    "* on(namespace,pod) group_left(node) kube_pod_info "
    "* on(namespace,pod,container) group_left(reason) "
    "max by (namespace,pod,container,reason) "
    "(kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})"
)
STUCK_TABLE_EXPR = (
    "("
    "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) "
    "and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) "
    "* on(namespace,pod) group_left(node) kube_pod_info"
    ")"
)

NAMESPACE_CPU_EXPR = (
    'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=""'
    ',pod!=""}[5m])) by (namespace))'
)
NAMESPACE_RAM_EXPR = (
    'topk(10, sum(container_memory_working_set_bytes{namespace!=""'
    ',pod!=""}) by (namespace))'
)
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
# ``or on() vector(0)`` makes the series read 0 instead of being empty.
NET_INGRESS_EXPR = (
    'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])) '
    "or on() vector(0)"
)
NET_EGRESS_EXPR = (
    'sum(rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m])) '
    "or on() vector(0)"
)

# ---------------------------------------------------------------------------
# Panel factories
# ---------------------------------------------------------------------------


def stat_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    thresholds=None,
    text_mode="value",
    legend=None,
    instant=False,
    value_suffix=None,
    links=None,
):
    """Return a Grafana stat panel definition.

    ``thresholds`` falls back to a grey/green absolute scheme when omitted.
    ``legend``, ``instant``, ``value_suffix`` and ``links`` are only written
    into the panel when they are truthy.
    """
    defaults = {
        "color": {"mode": "palette-classic"},
        "mappings": [],
        "thresholds": thresholds
        or {
            "mode": "absolute",
            "steps": [
                {"color": "rgba(115, 115, 115, 1)", "value": None},
                {"color": "green", "value": 1},
            ],
        },
        "unit": unit,
        "custom": {"displayMode": "auto"},
    }
    if value_suffix:
        defaults["custom"]["valueSuffix"] = value_suffix

    panel = {
        "id": panel_id,
        "type": "stat",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {"defaults": defaults, "overrides": []},
        "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "textMode": text_mode,
        },
    }
    if legend:
        panel["targets"][0]["legendFormat"] = legend
    if instant:
        panel["targets"][0]["instant"] = True
    if links:
        panel["links"] = links
    return panel


def timeseries_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    legend=None,
    legend_display="table",
    legend_placement="bottom",
    legend_calcs=None,
    time_from=None,
    links=None,
):
    """Return a Grafana time-series panel definition.

    ``legend``, ``legend_calcs``, ``time_from`` and ``links`` are only written
    into the panel when they are truthy.
    """
    panel = {
        "id": panel_id,
        "type": "timeseries",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
        "options": {
            "legend": {
                "displayMode": legend_display,
                "placement": legend_placement,
            },
            "tooltip": {"mode": "multi"},
        },
    }
    if legend:
        panel["targets"][0]["legendFormat"] = legend
    if legend_calcs:
        panel["options"]["legend"]["calcs"] = legend_calcs
    if time_from:
        panel["timeFrom"] = time_from
    if links:
        panel["links"] = links
    return panel


def table_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    transformations=None,
):
    """Return a Grafana table panel definition."""
    panel = {
        "id": panel_id,
        "type": "table",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
        "options": {"showHeader": True},
    }
    if transformations:
        panel["transformations"] = transformations
    return panel


def pie_panel(panel_id, title, expr, grid):
    """Return a pie chart panel with readable namespace labels."""
    return {
        "id": panel_id,
        "type": "piechart",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
        "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []},
        "options": {
            "legend": {"displayMode": "list", "placement": "right"},
            "pieType": "pie",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
        },
    }


def text_panel(panel_id, title, content, grid):
    """Return a markdown text panel (no datasource)."""
    return {
        "id": panel_id,
        "type": "text",
        "title": title,
        "gridPos": grid,
        "datasource": None,
        "options": {"mode": "markdown", "content": content},
    }


def link_to(uid):
    """Return a one-element panel link list pointing at dashboard ``uid``."""
    return [{"title": f"Open {uid} dashboard", "url": f"/d/{uid}", "targetBlank": True}]


# ---------------------------------------------------------------------------
# Dashboard builders
# ---------------------------------------------------------------------------


def build_overview():
    """Return the "Atlas Overview" dashboard definition (panels 1-25)."""
    panels = []

    # Row 1: (panel id, title, expr, value suffix, ok value, links).
    row1_stats = [
        (
            1,
            "Workers ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
            WORKER_SUFFIX,
            WORKER_TOTAL,
            None,
        ),
        (
            2,
            "Control plane ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
            CONTROL_SUFFIX,
            CONTROL_TOTAL,
            None,
        ),
        (
            3,
            "Control plane workloads",
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
            None,
            1,
            link_to("atlas-pods"),
        ),
        (
            4,
            "Problem pods",
            PROBLEM_PODS_EXPR,
            None,
            1,
            link_to("atlas-pods"),
        ),
        (
            5,
            "Stuck terminating",
            STUCK_TERMINATING_EXPR,
            None,
            1,
            link_to("atlas-pods"),
        ),
        (6, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None),
    ]
    # The "ok value" slot is carried in the tuples but not consumed here.
    for idx, (panel_id, title, expr, suffix, _ok_value, links) in enumerate(row1_stats):
        thresholds = None
        if panel_id == 1:
            # Workers: green only when every worker is ready.
            thresholds = {
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "orange", "value": WORKER_TOTAL - 2},
                    {"color": "yellow", "value": WORKER_TOTAL - 1},
                    {"color": "green", "value": WORKER_TOTAL},
                ],
            }
        elif panel_id == 2:
            thresholds = {
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "green", "value": CONTROL_TOTAL},
                ],
            }
        elif panel_id in (3, 4, 5):
            # Problem counters: zero is green, escalating from 1 upwards.
            thresholds = {
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 1},
                    {"color": "orange", "value": 2},
                    {"color": "red", "value": 3},
                ],
            }
        panels.append(
            stat_panel(
                panel_id,
                title,
                expr,
                {"h": 5, "w": 4, "x": 4 * idx, "y": 0},
                value_suffix=suffix,
                thresholds=thresholds,
                links=links,
            )
        )

    # Every "hottest node" panel is reduced to its single top series so the
    # stat shows one node; CPU and RAM are wrapped like NET and I/O.
    hottest = [
        (7, "Hottest node: CPU", topk_with_node(node_cpu_expr()), "percent"),
        (8, "Hottest node: RAM", topk_with_node(node_mem_expr()), "percent"),
        (9, "Hottest node: NET (rx+tx)", topk_with_node(node_net_expr()), "Bps"),
        (10, "Hottest node: I/O (r+w)", topk_with_node(node_io_expr()), "Bps"),
    ]
    for idx, (panel_id, title, expr, unit) in enumerate(hottest):
        panels.append(
            stat_panel(
                panel_id,
                title,
                expr,
                {"h": 5, "w": 6, "x": 6 * idx, "y": 5},
                unit=unit,
                thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                text_mode="name_and_value",
                legend="{{node}}",
                instant=True,
                links=link_to("atlas-nodes"),
            )
        )

    panels.append(
        pie_panel(
            11,
            "Namespace CPU share",
            NAMESPACE_CPU_EXPR,
            {"h": 9, "w": 12, "x": 0, "y": 10},
        )
    )
    panels.append(
        pie_panel(
            12,
            "Namespace RAM share",
            NAMESPACE_RAM_EXPR,
            {"h": 9, "w": 12, "x": 12, "y": 10},
        )
    )

    panels.append(
        timeseries_panel(
            13,
            "Cluster node CPU",
            node_cpu_expr(),
            {"h": 8, "w": 12, "x": 0, "y": 19},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            links=link_to("atlas-nodes"),
        )
    )
    panels.append(
        timeseries_panel(
            14,
            "Cluster node RAM",
            node_mem_expr(),
            {"h": 8, "w": 12, "x": 12, "y": 19},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            links=link_to("atlas-nodes"),
        )
    )

    panels.append(
        timeseries_panel(
            15,
            "Control plane CPU (incl. titan-db)",
            node_cpu_expr(CONTROL_ALL_REGEX),
            {"h": 7, "w": 12, "x": 0, "y": 27},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            16,
            "Control plane RAM (incl. titan-db)",
            node_mem_expr(CONTROL_ALL_REGEX),
            {"h": 7, "w": 12, "x": 12, "y": 27},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )

    panels.append(
        timeseries_panel(
            17,
            "Cluster ingress throughput",
            NET_INGRESS_EXPR,
            {"h": 7, "w": 12, "x": 0, "y": 34},
            unit="Bps",
            legend_display="list",
            legend_placement="bottom",
            links=link_to("atlas-network"),
        )
    )
    panels.append(
        timeseries_panel(
            18,
            "Cluster egress throughput",
            NET_EGRESS_EXPR,
            {"h": 7, "w": 12, "x": 12, "y": 34},
            unit="Bps",
            legend_display="list",
            legend_placement="bottom",
            links=link_to("atlas-network"),
        )
    )

    panels.append(
        timeseries_panel(
            19,
            "Root filesystem usage",
            root_usage_expr(),
            {"h": 8, "w": 12, "x": 0, "y": 41},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            time_from="30d",
            links=link_to("atlas-storage"),
        )
    )
    panels.append(
        {
            "id": 20,
            "type": "bargauge",
            "title": "Nodes closest to full root disks",
            "datasource": PROM_DS,
            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 41},
            "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A"}],
            "fieldConfig": {
                "defaults": {
                    "unit": "percent",
                    "min": 0,
                    "max": 100,
                    "thresholds": {
                        "mode": "absolute",
                        "steps": [
                            {"color": "green", "value": None},
                            {"color": "yellow", "value": 50},
                            {"color": "orange", "value": 70},
                            {"color": "red", "value": 85},
                        ],
                    },
                    "displayName": "{{node}}",
                },
                "overrides": [],
            },
            "options": {
                "displayMode": "gradient",
                "orientation": "horizontal",
                "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            },
            "links": link_to("atlas-storage"),
        }
    )

    storage_panels = [
        (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
        (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
        (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
        (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
    ]
    for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
        panels.append(
            stat_panel(
                panel_id,
                title,
                expr,
                {"h": 6, "w": 6, "x": 6 * idx, "y": 49},
                unit=unit,
                thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                links=link_to("atlas-storage"),
            )
        )

    panels.append(
        text_panel(
            25,
            "About this dashboard",
            textwrap.dedent(
                """\
                ### Atlas Overview
                - Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.
                - Control plane workload count flags any non-system pods that slipped onto the HA nodes.
                - Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly."""
            ),
            {"h": 5, "w": 24, "x": 0, "y": 55},
        )
    )

    return {
        "uid": "atlas-overview",
        "title": "Atlas Overview",
        "folderUid": PUBLIC_FOLDER,
        "editable": False,
        "annotations": {"list": []},
        "panels": panels,
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "overview"],
        "templating": {"list": []},
        "time": {"from": "now-12h", "to": "now"},
        "links": [
            {"title": "Atlas Pods", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
            {"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
            {"title": "Atlas Storage", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False},
            {"title": "Atlas Network", "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": False},
        ],
    }


def build_pods_dashboard():
    """Return the "Atlas Pods" dashboard: four stat panels and three tables."""
    panels = []
    panels.append(
        stat_panel(
            1,
            "Problem pods",
            PROBLEM_PODS_EXPR,
            {"h": 4, "w": 6, "x": 0, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            2,
            "CrashLoop / ImagePull",
            CRASHLOOP_EXPR,
            {"h": 4, "w": 6, "x": 6, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            3,
            "Stuck terminating (>10m)",
            STUCK_TERMINATING_EXPR,
            {"h": 4, "w": 6, "x": 12, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            4,
            "Control plane workloads",
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
            {"h": 4, "w": 6, "x": 18, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )

    panels.append(
        table_panel(
            5,
            "Pods not running",
            PROBLEM_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 4},
            unit="s",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            6,
            "CrashLoop / ImagePull",
            CRASHLOOP_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 14},
            unit="s",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            7,
            "Terminating >10m",
            STUCK_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 24},
            unit="s",
            transformations=[
                {"id": "labelsToFields", "options": {}},
                # Keep only rows whose value exceeds 600 seconds.
                {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
            ],
        )
    )

    return {
        "uid": "atlas-pods",
        "title": "Atlas Pods",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "pods"],
    }


def build_nodes_dashboard():
    """Return the "Atlas Nodes" dashboard: readiness stats and node time series."""
    panels = []
    panels.append(
        stat_panel(
            1,
            "Worker nodes ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
            {"h": 4, "w": 8, "x": 0, "y": 0},
            value_suffix=WORKER_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            2,
            "Control plane ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
            {"h": 4, "w": 8, "x": 8, "y": 0},
            value_suffix=CONTROL_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            3,
            "Control plane workloads",
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
            {"h": 4, "w": 8, "x": 16, "y": 0},
        )
    )

    panels.append(
        timeseries_panel(
            4,
            "Node CPU",
            node_cpu_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 4},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            5,
            "Node RAM",
            node_mem_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 13},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            6,
            "Control plane (incl. titan-db) CPU",
            node_cpu_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 0, "y": 22},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            7,
            "Control plane (incl. titan-db) RAM",
            node_mem_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 12, "y": 22},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            8,
            "Root filesystem usage",
            root_usage_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 31},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )

    return {
        "uid": "atlas-nodes",
        "title": "Atlas Nodes",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "nodes"],
    }


def build_storage_dashboard():
    """Return the "Atlas Storage" dashboard for the astreae/asteria mounts."""
    panels = []
    panels.append(
        stat_panel(
            1,
            "Astreae usage",
            astreae_usage_expr("/mnt/astreae"),
            {"h": 5, "w": 6, "x": 0, "y": 0},
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
        )
    )
    panels.append(
        stat_panel(
            2,
            "Asteria usage",
            astreae_usage_expr("/mnt/asteria"),
            {"h": 5, "w": 6, "x": 6, "y": 0},
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
        )
    )
    panels.append(
        stat_panel(
            3,
            "Astreae free",
            astreae_free_expr("/mnt/astreae"),
            {"h": 5, "w": 6, "x": 12, "y": 0},
            unit="decbytes",
        )
    )
    panels.append(
        stat_panel(
            4,
            "Asteria free",
            astreae_free_expr("/mnt/asteria"),
            {"h": 5, "w": 6, "x": 18, "y": 0},
            unit="decbytes",
        )
    )

    panels.append(
        timeseries_panel(
            5,
            "Astreae per-node usage",
            filesystem_usage_expr("/mnt/astreae", LONGHORN_NODE_REGEX),
            {"h": 9, "w": 12, "x": 0, "y": 5},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    panels.append(
        timeseries_panel(
            6,
            "Asteria per-node usage",
            filesystem_usage_expr("/mnt/asteria", LONGHORN_NODE_REGEX),
            {"h": 9, "w": 12, "x": 12, "y": 5},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    panels.append(
        timeseries_panel(
            7,
            "Astreae usage history",
            astreae_usage_expr("/mnt/astreae"),
            {"h": 9, "w": 12, "x": 0, "y": 14},
            unit="percent",
            time_from="90d",
        )
    )
    panels.append(
        timeseries_panel(
            8,
            "Asteria usage history",
            astreae_usage_expr("/mnt/asteria"),
            {"h": 9, "w": 12, "x": 12, "y": 14},
            unit="percent",
            time_from="90d",
        )
    )

    return {
        "uid": "atlas-storage",
        "title": "Atlas Storage",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "storage"],
    }


def build_network_dashboard():
    """Return the "Atlas Network" dashboard: throughput and Traefik request rates."""
    panels = []
    panels.append(
        stat_panel(1, "Ingress traffic", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="Bps")
    )
    panels.append(
        stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps")
    )
    panels.append(
        stat_panel(
            3,
            "Top router req/s",
            f"topk(1, {TRAEFIK_ROUTER_EXPR})",
            {"h": 4, "w": 8, "x": 16, "y": 0},
            unit="req/s",
            legend="{{router}}",
            instant=True,
        )
    )

    panels.append(
        timeseries_panel(
            4,
            "Per-node throughput",
            node_net_expr(),
            {"h": 8, "w": 24, "x": 0, "y": 4},
            unit="Bps",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )

    panels.append(
        table_panel(
            5,
            "Top namespaces",
            'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
            '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
            {"h": 9, "w": 12, "x": 0, "y": 12},
            unit="Bps",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            6,
            "Top pods",
            'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
            '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
            {"h": 9, "w": 12, "x": 12, "y": 12},
            unit="Bps",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )

    panels.append(
        timeseries_panel(
            7,
            "Traefik routers (req/s)",
            f"topk(10, {TRAEFIK_ROUTER_EXPR})",
            {"h": 9, "w": 12, "x": 0, "y": 21},
            unit="req/s",
            legend="{{router}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            8,
            "Traefik entrypoints (req/s)",
            'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
            {"h": 9, "w": 12, "x": 12, "y": 21},
            unit="req/s",
            legend="{{entrypoint}}",
            legend_display="table",
            legend_placement="right",
        )
    )

    return {
        "uid": "atlas-network",
        "title": "Atlas Network",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "network"],
    }


# Dashboard uid -> builder function and the ConfigMap file it is rendered to.
DASHBOARDS = {
    "atlas-overview": {
        "builder": build_overview,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-overview.yaml",
    },
    "atlas-pods": {
        "builder": build_pods_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-pods.yaml",
    },
    "atlas-nodes": {
        "builder": build_nodes_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-nodes.yaml",
    },
    "atlas-storage": {
        "builder": build_storage_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-storage.yaml",
    },
    "atlas-network": {
        "builder": build_network_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml",
    },
}


def write_json(uid, data):
    """Write ``data`` as indented JSON to ``DASHBOARD_DIR/<uid>.json``.

    The dashboard directory is created if it does not exist yet.
    """
    DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
    path = DASHBOARD_DIR / f"{uid}.json"
    path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")


def render_configmap(uid, info):
    """Render ``DASHBOARD_DIR/<uid>.json`` into the ConfigMap at ``info["configmap"]``.

    The JSON is re-serialised with a two-space indent and every line is
    prefixed with ``PAYLOAD_INDENT`` so it forms the body of the YAML block
    scalar in ``CONFIG_TEMPLATE``.  Raises ``FileNotFoundError`` if the JSON
    file has not been built yet.
    """
    json_path = DASHBOARD_DIR / f"{uid}.json"
    payload = json.dumps(json.loads(json_path.read_text(encoding="utf-8")), indent=2)
    indented = "\n".join(PAYLOAD_INDENT + line for line in payload.splitlines())
    output_path = info["configmap"]
    content = CONFIG_TEMPLATE.format(
        relative_path=output_path.relative_to(ROOT),
        name=output_path.stem,
        key=json_path.name,
        payload=indented,
    )
    output_path.write_text(content, encoding="utf-8")
    print(f"Rendered {json_path.name} -> {output_path.relative_to(ROOT)}")


def main():
    """Parse CLI arguments, optionally rebuild the JSON, then render all ConfigMaps."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--build", action="store_true", help="Regenerate dashboard JSON files from builders")
    args = parser.parse_args()

    if args.build:
        for uid, info in DASHBOARDS.items():
            write_json(uid, info["builder"]())

    for uid, info in DASHBOARDS.items():
        render_configmap(uid, info)


if __name__ == "__main__":
    main()