#!/usr/bin/env python3
"""Generate Grafana dashboards and render them into ConfigMaps.

Usage:
    python scripts/render_dashboards.py --build   # rebuild JSON + ConfigMaps
    python scripts/render_dashboards.py           # just render ConfigMaps
"""

import argparse
import json
import textwrap
from pathlib import Path
from typing import Optional

ROOT = Path(__file__).resolve().parents[1]
DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards"

# The dashboard JSON is embedded as a YAML literal block scalar under
# ``data.<key>``; ``{payload}`` must already be indented deeper than the key.
CONFIG_TEMPLATE = textwrap.dedent(
    """\
    # {relative_path}
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: {name}
      labels:
        grafana_dashboard: "1"
    data:
      {key}: |
    {payload}
    """
)

# Indentation applied to every payload line: two levels, i.e. one level deeper
# than the ``{key}`` entry in CONFIG_TEMPLATE.
PAYLOAD_INDENT = " " * 4

PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}

# Grafana lays panels out on a grid that is 24 columns wide; ``x + w`` of a
# panel must not exceed this.
GRID_COLUMNS = 24

# Regex matching the control-plane node names used by the scoped panels.
CONTROL_PLANE_REGEX = "titan-0a|titan-0b|titan-0c"

# Age of every pod that is not Running/Succeeded, labelled with node and phase.
PROBLEM_PODS_EXPR = (
    "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info"
    " * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase)"
    ' (kube_pod_status_phase{phase!~"Running|Succeeded"})'
)

# Seconds since deletion was requested for every pod that has a deletion timestamp.
TERMINATING_PODS_EXPR = (
    "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info"
)


# --------------------------------------------------------------------------- #
# Panel helper factories
# --------------------------------------------------------------------------- #


def stat_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    thresholds: Optional[dict] = None,
    text_mode="value",
    legend: Optional[str] = None,
) -> dict:
    """Build a Prometheus-backed ``stat`` panel.

    Args:
        panel_id: Panel id within the dashboard.
        title: Panel title.
        expr: PromQL expression for the single target (refId ``A``).
        grid: ``gridPos`` dict (``h``, ``w``, ``x``, ``y``).
        unit: Field unit.
        thresholds: Threshold config; when falsy, a grey/green-at-1 default is used.
        text_mode: Value for ``options.textMode``.
        legend: Optional ``legendFormat`` for the target.

    Returns:
        The panel as a JSON-serialisable dict.
    """
    defaults = {
        "color": {"mode": "palette-classic"},
        "mappings": [],
        "thresholds": thresholds
        or {
            "mode": "absolute",
            "steps": [
                {"color": "rgba(115, 115, 115, 1)", "value": None},
                {"color": "green", "value": 1},
            ],
        },
        "unit": unit,
    }
    panel = {
        "id": panel_id,
        "type": "stat",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {"defaults": defaults, "overrides": []},
        "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "textMode": text_mode,
        },
    }
    if legend:
        panel["targets"][0]["legendFormat"] = legend
    return panel


def timeseries_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    legend: Optional[str] = None,
    legend_display="table",
    legend_placement="bottom",
    legend_calcs: Optional[list] = None,
    time_from: Optional[str] = None,
) -> dict:
    """Build a Prometheus-backed ``timeseries`` panel.

    Args:
        panel_id: Panel id within the dashboard.
        title: Panel title.
        expr: PromQL expression for the single target (refId ``A``).
        grid: ``gridPos`` dict (``h``, ``w``, ``x``, ``y``).
        unit: Field unit.
        legend: Optional ``legendFormat`` for the target.
        legend_display: Value for ``options.legend.displayMode``.
        legend_placement: Value for ``options.legend.placement``.
        legend_calcs: Optional list stored as ``options.legend.calcs``.
        time_from: Optional per-panel ``timeFrom`` override (e.g. ``"7d"``).

    Returns:
        The panel as a JSON-serialisable dict.
    """
    panel = {
        "id": panel_id,
        "type": "timeseries",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
        "options": {
            "legend": {
                "displayMode": legend_display,
                "placement": legend_placement,
            },
            "tooltip": {"mode": "multi"},
        },
    }
    if legend:
        panel["targets"][0]["legendFormat"] = legend
    if legend_calcs:
        panel["options"]["legend"]["calcs"] = legend_calcs
    if time_from:
        panel["timeFrom"] = time_from
    return panel


def table_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    transformations: Optional[list] = None,
    description: Optional[str] = None,
) -> dict:
    """Build a Prometheus-backed ``table`` panel.

    Args:
        panel_id: Panel id within the dashboard.
        title: Panel title.
        expr: PromQL expression for the single target (refId ``A``).
        grid: ``gridPos`` dict (``h``, ``w``, ``x``, ``y``).
        unit: Field unit.
        transformations: Optional list stored as the panel's ``transformations``.
        description: Optional panel description.

    Returns:
        The panel as a JSON-serialisable dict.
    """
    panel = {
        "id": panel_id,
        "type": "table",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
        "options": {"showHeader": True},
    }
    if transformations:
        panel["transformations"] = transformations
    if description:
        panel["description"] = description
    return panel


def pie_panel(panel_id, title, expr, grid) -> dict:
    """Build a ``piechart`` panel whose slices are labelled by ``{{namespace}}``."""
    return {
        "id": panel_id,
        "type": "piechart",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
        "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []},
        "options": {
            "legend": {"displayMode": "list", "placement": "right"},
            "pieType": "pie",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
        },
    }


def text_panel(panel_id, title, content, grid) -> dict:
    """Build a markdown ``text`` panel (no datasource)."""
    return {
        "id": panel_id,
        "type": "text",
        "title": title,
        "gridPos": grid,
        "datasource": None,
        "options": {"mode": "markdown", "content": content},
    }


def _labels_to_fields() -> list:
    """Return a fresh transformation list containing only ``labelsToFields``."""
    return [{"id": "labelsToFields", "options": {}}]


def _terminating_transformations() -> list:
    """Return the fresh transformation list used by the terminating-pods tables."""
    return [
        {"id": "labelsToFields", "options": {}},
        {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
    ]


# --------------------------------------------------------------------------- #
# PromQL expression builders
# --------------------------------------------------------------------------- #


def _scope_to_nodes(expr: str, scope: str) -> str:
    """Restrict a per-node expression to nodes matching the regex ``scope``.

    An empty ``scope`` returns ``expr`` unchanged.
    """
    if scope:
        return f'{expr} * on(node) group_left() kube_node_info{{node=~"{scope}"}}'
    return expr


def node_cpu_expr(scope: str = "") -> str:
    """Return the per-node CPU usage (percent) query, optionally scoped by node regex."""
    expr = (
        'avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),'
        ' "internal_ip", "$1", "instance", "([^:]+):.*")) * 100)'
        " * on (internal_ip) group_left(node) kube_node_info))"
    )
    return _scope_to_nodes(expr, scope)


def node_mem_expr(scope: str = "") -> str:
    """Return the per-node memory usage (percent) query, optionally scoped by node regex."""
    expr = (
        "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)"
        ' / node_memory_MemTotal_bytes, "internal_ip", "$1", "instance", "([^:]+):.*") * 100)'
        " * on (internal_ip) group_left(node) kube_node_info))"
    )
    return _scope_to_nodes(expr, scope)


def root_usage_expr() -> str:
    """Return the per-node root filesystem usage (percent) query."""
    return (
        "avg by (node) (((1 - (label_replace("
        'node_filesystem_avail_bytes{mountpoint="/",fstype!~"tmpfs|overlay"}'
        ' / node_filesystem_size_bytes{mountpoint="/",fstype!~"tmpfs|overlay"},'
        ' "internal_ip", "$1", "instance", "([^:]+):.*"))) * 100)'
        " * on (internal_ip) group_left(node) kube_node_info)"
    )


def astreae_usage_expr(mount: str) -> str:
    """Return the aggregate usage (percent) query for the filesystem mounted at ``mount``."""
    return (
        f'100 - (sum(node_filesystem_avail_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}) / '
        f'sum(node_filesystem_size_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}) * 100)'
    )


def astreae_free_expr(mount: str) -> str:
    """Return the aggregate free-bytes query for the filesystem mounted at ``mount``."""
    return f'sum(node_filesystem_avail_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}})'


def astreae_node_usage_expr(mount: str) -> str:
    """Return the per-series (unaggregated) usage (percent) query for ``mount``."""
    return (
        f'100 - (node_filesystem_avail_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}} / '
        f'node_filesystem_size_bytes{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}} * 100)'
    )


# --------------------------------------------------------------------------- #
# Dashboard builders
# --------------------------------------------------------------------------- #


def build_overview() -> dict:
    """Build the "Atlas Overview" dashboard model."""
    thresholds_percent = {
        "mode": "percentage",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 70},
            {"color": "red", "value": 85},
        ],
    }

    # Eight stat tiles share the top row, so each gets an equal slice of the
    # grid; wider tiles would push the last ones past the right-hand edge.
    stat_width = GRID_COLUMNS // 8

    def stat_grid(slot: int) -> dict:
        return {"h": 5, "w": stat_width, "x": stat_width * slot, "y": 0}

    # ``bool`` makes a comparison yield 0/1 per series instead of filtering, so
    # ``sum`` counts the matching series rather than adding up their raw values.
    stats = [
        (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})'),
        (2, "Ready nodes", 'sum(kube_node_status_condition{condition="Ready",status="true"})'),
        (
            3,
            "Control plane ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_PLANE_REGEX}"}})',
        ),
        (
            4,
            "Control plane schedulable",
            f'sum(kube_node_spec_unschedulable{{node=~"{CONTROL_PLANE_REGEX}"}} == bool 0)',
        ),
        (5, "Problem pods", 'sum(kube_pod_status_phase{phase!~"Running|Succeeded"})'),
        (6, "Stuck terminating", "sum(((time() - kube_pod_deletion_timestamp) > bool 600))"),
    ]
    panels = [
        stat_panel(panel_id, title, expr, stat_grid(slot))
        for slot, (panel_id, title, expr) in enumerate(stats)
    ]

    panels.append(
        stat_panel(
            7,
            "Hottest node: CPU",
            node_cpu_expr(),
            stat_grid(6),
            unit="percent",
            thresholds=thresholds_percent,
            text_mode="value_and_name",
            legend="{{node}}",
        )
    )
    panels.append(
        stat_panel(
            8,
            "Hottest node: RAM",
            node_mem_expr(),
            stat_grid(7),
            unit="percent",
            thresholds=thresholds_percent,
            text_mode="value_and_name",
            legend="{{node}}",
        )
    )

    panels.append(
        pie_panel(
            9,
            "Namespace CPU share",
            'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace))',
            {"h": 9, "w": 12, "x": 0, "y": 5},
        )
    )
    panels.append(
        pie_panel(
            10,
            "Namespace RAM share",
            'topk(10, sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace))',
            {"h": 9, "w": 12, "x": 12, "y": 5},
        )
    )

    panels.append(
        timeseries_panel(
            11,
            "Cluster node CPU",
            node_cpu_expr(),
            {"h": 8, "w": 12, "x": 0, "y": 14},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            12,
            "Cluster node RAM",
            node_mem_expr(),
            {"h": 8, "w": 12, "x": 12, "y": 14},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )

    panels.append(
        table_panel(
            13,
            "Problem pods (details)",
            PROBLEM_PODS_EXPR,
            {"h": 8, "w": 12, "x": 0, "y": 22},
            unit="s",
            transformations=_labels_to_fields(),
        )
    )
    panels.append(
        table_panel(
            14,
            "Terminating >10m",
            TERMINATING_PODS_EXPR,
            {"h": 8, "w": 12, "x": 12, "y": 22},
            unit="s",
            transformations=_terminating_transformations(),
        )
    )

    panels.append(
        timeseries_panel(
            15,
            "Control plane CPU",
            node_cpu_expr(CONTROL_PLANE_REGEX),
            {"h": 7, "w": 12, "x": 0, "y": 30},
            unit="percent",
            legend="{{node}}",
        )
    )
    panels.append(
        timeseries_panel(
            16,
            "Control plane RAM",
            node_mem_expr(CONTROL_PLANE_REGEX),
            {"h": 7, "w": 12, "x": 12, "y": 30},
            unit="percent",
            legend="{{node}}",
        )
    )

    panels.append(
        timeseries_panel(
            17,
            "Root filesystem usage",
            root_usage_expr(),
            {"h": 8, "w": 12, "x": 0, "y": 37},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            time_from="7d",
        )
    )
    panels.append(
        {
            "id": 18,
            "type": "bargauge",
            "title": "Nodes closest to full root disks",
            "datasource": PROM_DS,
            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 37},
            "targets": [
                {"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}
            ],
            "fieldConfig": {
                "defaults": {
                    "unit": "percent",
                    "min": 0,
                    "max": 100,
                    "thresholds": {
                        "mode": "percentage",
                        "steps": [
                            {"color": "green", "value": None},
                            {"color": "yellow", "value": 50},
                            {"color": "orange", "value": 70},
                            {"color": "red", "value": 85},
                        ],
                    },
                },
                "overrides": [],
            },
            "options": {
                "displayMode": "gradient",
                "orientation": "horizontal",
                "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            },
        }
    )

    panels.append(
        stat_panel(
            19,
            "Astreae usage",
            astreae_usage_expr("/mnt/astreae"),
            {"h": 6, "w": 6, "x": 0, "y": 45},
            unit="percent",
            thresholds=thresholds_percent,
        )
    )
    panels.append(
        stat_panel(
            20,
            "Asteria usage",
            astreae_usage_expr("/mnt/asteria"),
            {"h": 6, "w": 6, "x": 6, "y": 45},
            unit="percent",
            thresholds=thresholds_percent,
        )
    )
    panels.append(
        stat_panel(
            21,
            "Astreae free",
            astreae_free_expr("/mnt/astreae"),
            {"h": 6, "w": 6, "x": 12, "y": 45},
            unit="bytesSI",
        )
    )
    panels.append(
        stat_panel(
            22,
            "Asteria free",
            astreae_free_expr("/mnt/asteria"),
            {"h": 6, "w": 6, "x": 18, "y": 45},
            unit="bytesSI",
        )
    )

    panels.append(
        table_panel(
            23,
            "Astreae per-node usage",
            astreae_node_usage_expr("/mnt/astreae"),
            {"h": 8, "w": 12, "x": 0, "y": 51},
            unit="percent",
            transformations=_labels_to_fields(),
        )
    )
    panels.append(
        table_panel(
            24,
            "Asteria per-node usage",
            astreae_node_usage_expr("/mnt/asteria"),
            {"h": 8, "w": 12, "x": 12, "y": 51},
            unit="percent",
            transformations=_labels_to_fields(),
        )
    )

    panels.append(
        text_panel(
            25,
            "About this dashboard",
            "### Atlas at a glance\n"
            "- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n"
            "- Click the dashboard links in the header to drill into details\n"
            "- Anonymous users see this overview; login unlocks the detailed folders",
            {"h": 5, "w": 24, "x": 0, "y": 59},
        )
    )

    return {
        "uid": "atlas-overview",
        "title": "Atlas Overview",
        "annotations": {
            "list": [
                {
                    "builtIn": 1,
                    "datasource": {"type": "datasource", "uid": "grafana"},
                    "enable": True,
                    "hide": True,
                    "iconColor": "rgba(0, 211, 255, 1)",
                    "name": "Annotations & Alerts",
                    "type": "dashboard",
                }
            ]
        },
        "editable": False,
        "folderUid": "atlas-overview",
        "graphTooltip": 0,
        "links": [
            {"title": "Pods dashboard", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
            {"title": "Nodes dashboard", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
            {"title": "Storage dashboard", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False},
        ],
        "panels": panels,
        "refresh": "30s",
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "overview"],
        "templating": {"list": []},
        "time": {"from": "now-12h", "to": "now"},
    }


def build_pods_dashboard() -> dict:
    """Build the "Atlas Pods" dashboard model (three full-width tables)."""
    panels = [
        table_panel(
            1,
            "Pods not running",
            PROBLEM_PODS_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 0},
            unit="s",
            transformations=_labels_to_fields(),
        ),
        table_panel(
            2,
            "CrashLoop / ImagePull",
            "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info"
            " * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason)"
            ' (kube_pod_container_status_waiting_reason{reason=~"CrashLoopBackOff|ImagePullBackOff"})',
            {"h": 10, "w": 24, "x": 0, "y": 10},
            unit="s",
            transformations=_labels_to_fields(),
        ),
        table_panel(
            3,
            "Terminating pods",
            TERMINATING_PODS_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 20},
            unit="s",
            transformations=_terminating_transformations(),
        ),
    ]
    return {
        "uid": "atlas-pods",
        "title": "Atlas Pods",
        "folderUid": "atlas-pods",
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "pods"],
    }


def build_nodes_dashboard() -> dict:
    """Build the "Atlas Nodes" dashboard model."""
    panels = [
        stat_panel(1, "Node count", "count(kube_node_info)", {"h": 5, "w": 6, "x": 0, "y": 0}),
        stat_panel(
            2,
            "Ready nodes",
            'sum(kube_node_status_condition{condition="Ready",status="true"})',
            {"h": 5, "w": 6, "x": 6, "y": 0},
        ),
        stat_panel(
            3,
            "Control plane CPU avg",
            node_cpu_expr(CONTROL_PLANE_REGEX),
            {"h": 5, "w": 6, "x": 12, "y": 0},
            unit="percent",
            legend="{{node}}",
            text_mode="value_and_name",
        ),
        stat_panel(
            4,
            "Control plane RAM avg",
            node_mem_expr(CONTROL_PLANE_REGEX),
            {"h": 5, "w": 6, "x": 18, "y": 0},
            unit="percent",
            legend="{{node}}",
            text_mode="value_and_name",
        ),
        timeseries_panel(
            5,
            "Node CPU",
            node_cpu_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 5},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        ),
        timeseries_panel(
            6,
            "Node RAM",
            node_mem_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 14},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        ),
        timeseries_panel(
            7,
            "Root filesystem",
            root_usage_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 23},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="7d",
        ),
    ]
    return {
        "uid": "atlas-nodes",
        "title": "Atlas Nodes",
        "folderUid": "atlas-nodes",
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "nodes"],
    }


def build_storage_dashboard() -> dict:
    """Build the "Atlas Storage" dashboard model."""
    panels = [
        stat_panel(
            1,
            "Astreae usage",
            astreae_usage_expr("/mnt/astreae"),
            {"h": 5, "w": 6, "x": 0, "y": 0},
            unit="percent",
        ),
        stat_panel(
            2,
            "Asteria usage",
            astreae_usage_expr("/mnt/asteria"),
            {"h": 5, "w": 6, "x": 6, "y": 0},
            unit="percent",
        ),
        stat_panel(
            3,
            "Astreae free",
            astreae_free_expr("/mnt/astreae"),
            {"h": 5, "w": 6, "x": 12, "y": 0},
            unit="bytesSI",
        ),
        stat_panel(
            4,
            "Asteria free",
            astreae_free_expr("/mnt/asteria"),
            {"h": 5, "w": 6, "x": 18, "y": 0},
            unit="bytesSI",
        ),
        timeseries_panel(
            5,
            "Root filesystem",
            root_usage_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 5},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        ),
        table_panel(
            6,
            "Astreae nodes",
            astreae_node_usage_expr("/mnt/astreae"),
            {"h": 10, "w": 12, "x": 0, "y": 14},
            unit="percent",
            transformations=_labels_to_fields(),
        ),
        table_panel(
            7,
            "Asteria nodes",
            astreae_node_usage_expr("/mnt/asteria"),
            {"h": 10, "w": 12, "x": 12, "y": 14},
            unit="percent",
            transformations=_labels_to_fields(),
        ),
    ]
    return {
        "uid": "atlas-storage",
        "title": "Atlas Storage",
        "folderUid": "atlas-storage",
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "storage"],
    }


# Dashboard uid -> builder callable and the ConfigMap manifest it is rendered to.
DASHBOARDS = {
    "atlas-overview": {
        "builder": build_overview,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-overview.yaml",
    },
    "atlas-pods": {
        "builder": build_pods_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-pods.yaml",
    },
    "atlas-nodes": {
        "builder": build_nodes_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-nodes.yaml",
    },
    "atlas-storage": {
        "builder": build_storage_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-storage.yaml",
    },
}


def write_json(uid: str, data: dict) -> None:
    """Write the dashboard model ``data`` to ``DASHBOARD_DIR/<uid>.json``.

    The directory is created if needed; the file ends with a trailing newline.
    """
    DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
    path = DASHBOARD_DIR / f"{uid}.json"
    path.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")


def render_configmap(uid: str, data: dict) -> None:
    """Render ``DASHBOARD_DIR/<uid>.json`` into the ConfigMap at ``data["configmap"]``.

    Args:
        uid: Dashboard uid; selects the JSON file to embed.
        data: The ``DASHBOARDS`` entry for ``uid``; only ``"configmap"`` is read.

    Raises:
        FileNotFoundError: The dashboard JSON has not been generated yet.
    """
    json_path = DASHBOARD_DIR / f"{uid}.json"
    try:
        raw = json_path.read_text(encoding="utf-8")
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"{json_path} does not exist; run with --build to generate it"
        ) from err

    # Round-trip through the JSON parser so the embedded payload is always
    # valid and consistently formatted, whatever the file on disk looks like.
    payload = json.dumps(json.loads(raw), indent=2)
    indented = textwrap.indent(payload, PAYLOAD_INDENT)

    output_path = data["configmap"]
    content = CONFIG_TEMPLATE.format(
        relative_path=output_path.relative_to(ROOT),
        name=output_path.stem,
        key=json_path.name,
        payload=indented,
    )
    output_path.write_text(content, encoding="utf-8")
    print(f"Rendered {json_path.name} -> {output_path.relative_to(ROOT)}")


def main() -> None:
    """Parse CLI arguments, optionally rebuild the JSON files, then render all ConfigMaps."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--build",
        action="store_true",
        help="Regenerate dashboard JSON files from builders",
    )
    args = parser.parse_args()

    if args.build:
        for uid, info in DASHBOARDS.items():
            write_json(uid, info["builder"]())

    for uid, info in DASHBOARDS.items():
        render_configmap(uid, info)


if __name__ == "__main__":
    main()