#!/usr/bin/env python3
"""Generate Atlas Grafana dashboards and render them into ConfigMaps.
Usage:
scripts/dashboards_render_atlas.py --build # rebuild JSON + ConfigMaps
scripts/dashboards_render_atlas.py # re-render ConfigMaps from JSON
"""
import argparse
import json
import textwrap
from pathlib import Path
# ---------------------------------------------------------------------------
# Paths, folders, and shared metadata
# ---------------------------------------------------------------------------
ROOT = Path(__file__).resolve().parents[1]  # repo root (scripts/ is one level down)
DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards"
# ConfigMap manifest wrapped around each dashboard JSON file.  Keys under
# metadata:/data: must be nested for the YAML to be a valid ConfigMap; the
# JSON payload is pre-indented by render_configmap() so it sits under the
# "<key>: |" block scalar.
CONFIG_TEMPLATE = textwrap.dedent(
    """\
    # {relative_path}
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: {name}
      labels:
        grafana_dashboard: "1"
    data:
      {key}: |
    {payload}
    """
)
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}  # Grafana datasource ref shared by all panels
PUBLIC_FOLDER = "overview"  # Grafana folder UID for the public overview dashboard
PRIVATE_FOLDER = "atlas-internal"  # Grafana folder UID for internal dashboards
# Shared green/yellow/red steps for percentage-valued panels.
PERCENT_THRESHOLDS = {
    "mode": "percentage",
    "steps": [
        {"color": "green", "value": None},
        {"color": "yellow", "value": 70},
        {"color": "red", "value": 85},
    ],
}
# ---------------------------------------------------------------------------
# Cluster metadata
# ---------------------------------------------------------------------------
CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
CONTROL_DEPENDENCIES = ["titan-db"]  # not a control-plane node, but graphed alongside
CONTROL_ALL = CONTROL_PLANE_NODES + CONTROL_DEPENDENCIES
WORKER_NODES = [
    "titan-04",
    "titan-05",
    "titan-06",
    "titan-07",
    "titan-08",
    "titan-09",
    "titan-10",
    "titan-11",
    "titan-12",
    "titan-13",
    "titan-14",
    "titan-15",
    "titan-16",
    "titan-17",
    "titan-18",
    "titan-19",
    "titan-22",
    "titan-24",
]
# Alternation regexes and counts derived from the node lists above.
CONTROL_REGEX = "|".join(CONTROL_PLANE_NODES)
CONTROL_ALL_REGEX = "|".join(CONTROL_ALL)
WORKER_REGEX = "|".join(WORKER_NODES)
CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
WORKER_TOTAL = len(WORKER_NODES)
CONTROL_SUFFIX = f"/{CONTROL_TOTAL}"  # e.g. "/3", appended after ready counts
WORKER_SUFFIX = f"/{WORKER_TOTAL}"
# Namespaces permitted to schedule pods onto control-plane nodes.
CP_ALLOWED_NS = "kube-system|kube-public|kube-node-lease|longhorn-system|monitoring|flux-system"
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"  # nodes backing the shared storage mounts
GAUGE_WIDTHS = [5, 5, 5, 5, 4]  # grid widths for the overview's five top-row tiles
# Pods running on control-plane nodes outside the allowed namespaces
# (falls back to 0 when no series match).
CONTROL_WORKLOADS_EXPR = (
    f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}}) or on() vector(0)'
)
# ---------------------------------------------------------------------------
# PromQL helpers
# ---------------------------------------------------------------------------
# Maps node_exporter instance-level series to their Kubernetes node name by
# copying nodename -> node on node_uname_info.
NODE_INFO = 'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")'
def node_filter(regex):
    """Build a PromQL selector for node_uname_info series whose nodename
    matches *regex*, relabelled so the series carries a ``node`` label."""
    selector = f'node_uname_info{{nodename=~"{regex}"}}'
    return f'label_replace({selector}, "node", "$1", "nodename", "(.*)")'
def scoped_node_expr(base, scope=""):
    """Average *base* per node via the node_uname_info join; when *scope*
    is non-empty, restrict the result to nodes matching that regex."""
    joined = f"avg by (node) (({base}) * on(instance) group_left(node) {NODE_INFO})"
    if not scope:
        return joined
    return f"({joined}) * on(node) group_left() {node_filter(scope)}"
def node_cpu_expr(scope=""):
    """Per-node CPU busy percentage (100% minus idle), optionally scoped."""
    idle_rate = 'avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))'
    return scoped_node_expr(f"(1 - {idle_rate}) * 100", scope)
def node_mem_expr(scope=""):
    """Per-node memory usage percentage (Total minus Available), optionally scoped."""
    used_pct = (
        "avg by (instance) ("
        "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) "
        "/ node_memory_MemTotal_bytes * 100)"
    )
    return scoped_node_expr(used_pct, scope)
def filesystem_usage_expr(mount, scope=""):
    """Per-node used percentage of the filesystem mounted at *mount*
    (tmpfs/overlay excluded), optionally scoped to a node regex."""
    sel = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
    pct_used = (
        f"avg by (instance) ((1 - (node_filesystem_avail_bytes{sel} "
        f"/ node_filesystem_size_bytes{sel})) * 100)"
    )
    return scoped_node_expr(pct_used, scope)
def root_usage_expr(scope=""):
    """Used percentage of each node's root ("/") filesystem."""
    return filesystem_usage_expr("/", scope)
def astreae_usage_expr(mount):
    """Cluster-wide used percentage of the shared filesystem at *mount*
    (summed across all nodes that export it)."""
    sel = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
    avail = f"sum(node_filesystem_avail_bytes{sel})"
    size = f"sum(node_filesystem_size_bytes{sel})"
    return f"100 - ({avail} / {size} * 100)"
def astreae_free_expr(mount):
    """Total free bytes across the shared filesystem at *mount*."""
    selector = f'mountpoint="{mount}",fstype!~"tmpfs|overlay"'
    return f"sum(node_filesystem_avail_bytes{{{selector}}})"
def topk_with_node(expr):
    """Keep only the single hottest series of *expr*, copying its node label
    into __name__ so stat panels display the node name."""
    top_one = f"topk(1, {expr})"
    return f'label_replace({top_one}, "__name__", "$1", "node", "(.*)")'
def node_net_expr(scope=""):
    """Per-node network throughput (rx + tx over non-loopback devices)."""
    throughput = (
        'sum by (instance) ('
        'rate(node_network_receive_bytes_total{device!~"lo"}[5m]) '
        '+ rate(node_network_transmit_bytes_total{device!~"lo"}[5m]))'
    )
    return scoped_node_expr(throughput, scope)
def node_io_expr(scope=""):
    """Per-node disk throughput (reads + writes)."""
    disk_rate = (
        "sum by (instance) (rate(node_disk_read_bytes_total[5m]) "
        "+ rate(node_disk_written_bytes_total[5m]))"
    )
    return scoped_node_expr(disk_rate, scope)
def namespace_share_expr(resource_expr):
    """Percentage share of *resource_expr* per namespace, limited to the
    namespaces selected by NAMESPACE_TOP_FILTER; the denominator is clamped
    to at least 1 to avoid division by zero."""
    filtered = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )"
    denominator = f"clamp_min(sum( {filtered} ), 1)"
    return f"100 * ( {filtered} ) / {denominator}"


def namespace_cpu_share_expr():
    """Per-namespace share of cluster CPU usage."""
    return namespace_share_expr(NAMESPACE_CPU_RAW)


def namespace_ram_share_expr():
    """Per-namespace share of cluster RAM (working set)."""
    return namespace_share_expr(NAMESPACE_RAM_RAW)


def namespace_gpu_share_expr():
    """Per-namespace share of GPU utilisation."""
    return namespace_share_expr(NAMESPACE_GPU_RAW)
# Count of pods in any phase other than Running/Succeeded.
PROBLEM_PODS_EXPR = 'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
# Count of containers waiting in CrashLoopBackOff or ImagePullBackOff.
CRASHLOOP_EXPR = (
    'sum(max by (namespace,pod) (kube_pod_container_status_waiting_reason'
    '{reason=~"CrashLoopBackOff|ImagePullBackOff"}))'
)
# Count of pods whose deletion timestamp is more than 10 minutes old.
STUCK_TERMINATING_EXPR = (
    'sum(max by (namespace,pod) ('
    '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)'
    ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
    '))'
)
# Table variants: value is pod age in seconds, joined with node and
# phase/reason labels for display.
PROBLEM_TABLE_EXPR = (
    "(time() - kube_pod_created{pod!=\"\"}) "
    "* on(namespace,pod) group_left(node) kube_pod_info "
    "* on(namespace,pod) group_left(phase) "
    "max by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})"
)
CRASHLOOP_TABLE_EXPR = (
    "(time() - kube_pod_created{pod!=\"\"}) "
    "* on(namespace,pod) group_left(node) kube_pod_info "
    "* on(namespace,pod,container) group_left(reason) "
    "max by (namespace,pod,container,reason) "
    "(kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})"
)
STUCK_TABLE_EXPR = (
    "("
    "((time() - kube_pod_deletion_timestamp{pod!=\"\"}) "
    "and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=\"\"} > bool 0)) "
    "* on(namespace,pod) group_left(node) kube_pod_info"
    ")"
)
# Raw per-namespace resource usage used by the share expressions above.
NAMESPACE_CPU_RAW = (
    'sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
)
NAMESPACE_RAM_RAW = (
    'sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace)'
)
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES)
# GPUs requested (or limited) per namespace.
NAMESPACE_GPU_ALLOC = (
    'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"}'
    ' or kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
)
# DCGM GPU utilisation per namespace: 1h average (for shares) and instant.
NAMESPACE_GPU_USAGE_SHARE = (
    'sum by (namespace) (avg_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!=\"\",pod!=\"\"}[1h]))'
)
NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
# GPU usage with a zero fallback per namespace, so namespaces without GPUs
# still appear in the share computations.
NAMESPACE_GPU_RAW = (
    "("
    + NAMESPACE_GPU_USAGE_SHARE
    + ") or on(namespace) ("
    + NAMESPACE_CPU_RAW
    + " * 0)"
)
NAMESPACE_GPU_WEIGHT = (
    "("
    + NAMESPACE_GPU_ALLOC
    + ") or on(namespace) ("
    + NAMESPACE_CPU_RAW
    + " * 0)"
)
# Combined activity score (CPU cores + RAM GB + 100x GPU count) used to pick
# the "top" namespaces for the share pie charts.
NAMESPACE_ACTIVITY_SCORE = (
    "( "
    + NAMESPACE_CPU_RAW
    + " ) + ("
    + NAMESPACE_RAM_RAW
    + " / 1e9) + ("
    + NAMESPACE_GPU_WEIGHT
    + " * 100)"
)
NAMESPACE_TOP_FILTER = "(topk(10, " + NAMESPACE_ACTIVITY_SCORE + ") >= bool 0)"
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
# Traefik pod network rates ("or vector(0)" keeps panels non-empty when idle).
TRAEFIK_NET_INGRESS = (
    'sum(rate(container_network_receive_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
    " or on() vector(0)"
)
TRAEFIK_NET_EGRESS = (
    'sum(rate(container_network_transmit_bytes_total{namespace="traefik",pod=~"traefik-.*"}[5m]))'
    " or on() vector(0)"
)
NET_CLUSTER_RX = (
    'sum(rate(container_network_receive_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
    " or on() vector(0)"
)
NET_CLUSTER_TX = (
    'sum(rate(container_network_transmit_bytes_total{namespace!="",pod!="",container!=""}[5m]))'
    " or on() vector(0)"
)
# Physical NICs only: exclude loopback, CNI/veth/bridge/vxlan/wireguard devices.
PHYSICAL_NET_FILTER = 'device!~"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*"'
NET_NODE_RX_PHYS = (
    f'sum(rate(node_network_receive_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)'
)
NET_NODE_TX_PHYS = (
    f'sum(rate(node_network_transmit_bytes_total{{{PHYSICAL_NET_FILTER}}}[5m])) or on() vector(0)'
)
NET_TOTAL_EXPR = NET_NODE_TX_PHYS
NET_INGRESS_EXPR = NET_NODE_RX_PHYS
NET_EGRESS_EXPR = NET_NODE_TX_PHYS
# Pod-to-pod traffic (everything except the traefik namespace).
NET_INTERNAL_EXPR = (
    'sum(rate(container_network_receive_bytes_total{namespace!="traefik",pod!=""}[5m]) '
    '+ rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m]))'
    ' or on() vector(0)'
)
# ---------------------------------------------------------------------------
# Panel factories
# ---------------------------------------------------------------------------
def stat_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    thresholds=None,
    text_mode="value",
    legend=None,
    instant=False,
    value_suffix=None,
    links=None,
):
    """Return a Grafana stat panel definition.

    The default thresholds step from grey to green at 1; pass *thresholds*
    to override. *legend*, *instant*, *value_suffix* and *links* add their
    keys only when supplied.
    """
    target = {"expr": expr, "refId": "A"}
    if legend:
        target["legendFormat"] = legend
    if instant:
        target["instant"] = True
    custom = {"displayMode": "auto"}
    if value_suffix:
        custom["valueSuffix"] = value_suffix
    default_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "rgba(115, 115, 115, 1)", "value": None},
            {"color": "green", "value": 1},
        ],
    }
    panel = {
        "id": panel_id,
        "type": "stat",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [target],
        "fieldConfig": {
            "defaults": {
                "color": {"mode": "palette-classic"},
                "mappings": [],
                "thresholds": thresholds or default_thresholds,
                "unit": unit,
                "custom": custom,
            },
            "overrides": [],
        },
        "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "textMode": text_mode,
        },
    }
    if links:
        panel["links"] = links
    return panel
def gauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    min_value=0,
    max_value=1,
    thresholds=None,
    links=None,
):
    """Return a Grafana gauge panel definition.

    Without explicit *thresholds* the gauge is green until *max_value*,
    where it turns red.
    """
    fallback_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "red", "value": max_value},
        ],
    }
    panel = {
        "id": panel_id,
        "type": "gauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {
            "defaults": {
                "min": min_value,
                "max": max_value,
                "thresholds": thresholds or fallback_thresholds,
            },
            "overrides": [],
        },
        "options": {
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "orientation": "auto",
            "showThresholdMarkers": False,
            "showThresholdLabels": False,
        },
    }
    if links:
        panel["links"] = links
    return panel
def timeseries_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    legend=None,
    legend_display="table",
    legend_placement="bottom",
    legend_calcs=None,
    time_from=None,
    links=None,
):
    """Return a Grafana time-series panel definition.

    *legend_calcs*, *time_from* and *links* add their keys only when given.
    """
    target = {"expr": expr, "refId": "A"}
    if legend:
        target["legendFormat"] = legend
    legend_options = {
        "displayMode": legend_display,
        "placement": legend_placement,
    }
    if legend_calcs:
        legend_options["calcs"] = legend_calcs
    panel = {
        "id": panel_id,
        "type": "timeseries",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [target],
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
        "options": {
            "legend": legend_options,
            "tooltip": {"mode": "multi"},
        },
    }
    if time_from:
        panel["timeFrom"] = time_from
    if links:
        panel["links"] = links
    return panel
def table_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    transformations=None,
):
    """Return a Grafana table panel definition; *transformations* are
    attached only when a non-empty list is supplied."""
    definition = {
        "id": panel_id,
        "type": "table",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
        "options": {"showHeader": True},
    }
    if transformations:
        definition["transformations"] = transformations
    return definition
def pie_panel(panel_id, title, expr, grid):
    """Return a pie chart panel whose slices are labelled by namespace."""
    query = {"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}
    return {
        "id": panel_id,
        "type": "piechart",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {
            "defaults": {
                "unit": "percent",
                "color": {"mode": "palette-classic"},
            },
            "overrides": [],
        },
        "options": {
            "legend": {"displayMode": "list", "placement": "right"},
            "pieType": "pie",
            "displayLabels": ["percent"],
            "tooltip": {"mode": "single"},
            "colorScheme": "interpolateSpectral",
            "colorBy": "value",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
        },
    }
def bargauge_panel(panel_id, title, expr, grid, *, unit="none", links=None):
    """Return a bar gauge panel keyed by node label; percent units get a
    fixed 0-100 range with green/yellow/orange/red steps."""
    is_percent = unit == "percent"
    panel = {
        "id": panel_id,
        "type": "bargauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{node}}"}],
        "fieldConfig": {
            "defaults": {
                "unit": unit,
                "min": 0,
                "max": 100 if is_percent else None,
                "thresholds": {
                    "mode": "absolute",
                    "steps": [
                        {"color": "green", "value": None},
                        {"color": "yellow", "value": 50},
                        {"color": "orange", "value": 70},
                        {"color": "red", "value": 85},
                    ],
                },
            },
            "overrides": [],
        },
        "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": {
                "calcs": ["lastNotNull"],
                "fields": "",
                "values": False,
            },
        },
    }
    if links:
        panel["links"] = links
    return panel
def text_panel(panel_id, title, content, grid):
    """Return a markdown text panel; text panels carry no datasource."""
    panel = {"id": panel_id, "type": "text", "title": title, "gridPos": grid}
    panel["datasource"] = None
    panel["options"] = {"mode": "markdown", "content": content}
    return panel
def link_to(uid):
    """Return a one-element panel-links list opening dashboard *uid* in a new tab."""
    link = {"title": f"Open {uid} dashboard", "url": f"/d/{uid}", "targetBlank": True}
    return [link]
# ---------------------------------------------------------------------------
# Dashboard builders
# ---------------------------------------------------------------------------
def build_overview():
    """Build the public Atlas Overview dashboard (uid "atlas-overview").

    Layout: top row of readiness/problem tiles, hottest-node stats, shared
    storage stats, namespace share pies, worker/control-plane CPU+RAM time
    series, network throughput, and root-disk usage.
    """
    panels = []
    # Top-row tiles: (panel_id, title, expr, value suffix, "ok" value, links).
    # NOTE(review): the suffix element is not consumed by the loop below.
    row1_stats = [
        (
            1,
            "Workers Ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
            WORKER_SUFFIX,
            WORKER_TOTAL,
            None,
        ),
        (
            2,
            "Control Plane Ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
            CONTROL_SUFFIX,
            CONTROL_TOTAL,
            None,
        ),
        (
            3,
            "Control Plane Workloads",
            CONTROL_WORKLOADS_EXPR,
            None,
            4,
            link_to("atlas-pods"),
        ),
        (
            4,
            "Problem Pods",
            PROBLEM_PODS_EXPR,
            None,
            1,
            link_to("atlas-pods"),
        ),
        (
            5,
            "Stuck Terminating",
            STUCK_TERMINATING_EXPR,
            None,
            1,
            link_to("atlas-pods"),
        ),
    ]

    def gauge_grid(idx):
        # Width and x-offset of the idx-th top-row tile, from GAUGE_WIDTHS.
        width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
        x = sum(GAUGE_WIDTHS[:idx])
        return width, x

    for idx, (panel_id, title, expr, suffix, ok_value, links) in enumerate(row1_stats):
        thresholds = None
        min_value = 0
        max_value = ok_value or 5
        if panel_id == 1:
            # Workers: shade from red to green as the ready count approaches
            # the full fleet size.
            max_value = WORKER_TOTAL
            thresholds = {
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "orange", "value": WORKER_TOTAL - 2},
                    {"color": "yellow", "value": WORKER_TOTAL - 1},
                    {"color": "green", "value": WORKER_TOTAL},
                ],
            }
        elif panel_id == 2:
            # Control plane: anything short of all nodes ready is red.
            max_value = CONTROL_TOTAL
            thresholds = {
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "green", "value": CONTROL_TOTAL},
                ],
            }
        elif panel_id in (3, 4, 5):
            # Problem counters: escalate colour as the count grows.
            max_value = 4
            thresholds = {
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 1},
                    {"color": "orange", "value": 2},
                    {"color": "red", "value": 3},
                ],
            }
        else:
            thresholds = {
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": max_value},
                ],
            }
        width, x = gauge_grid(idx)
        # Problem counters render as stat tiles; readiness counts as gauges.
        if panel_id in (3, 4, 5):
            panels.append(
                stat_panel(
                    panel_id,
                    title,
                    expr,
                    {"h": 5, "w": width, "x": x, "y": 0},
                    thresholds=thresholds,
                    legend=None,
                    links=links,
                    text_mode="value",
                )
            )
        else:
            panels.append(
                gauge_panel(
                    panel_id,
                    title,
                    expr,
                    {"h": 5, "w": width, "x": x, "y": 0},
                    min_value=min_value,
                    max_value=max_value,
                    thresholds=thresholds,
                    links=links,
                )
            )
    # Row 2: single hottest node per resource dimension.
    hottest = [
        (7, "Hottest node: CPU", topk_with_node(node_cpu_expr()), "percent"),
        (8, "Hottest node: RAM", topk_with_node(node_mem_expr()), "percent"),
        (9, "Hottest node: NET (rx+tx)", topk_with_node(node_net_expr()), "Bps"),
        (10, "Hottest node: I/O (r+w)", topk_with_node(node_io_expr()), "Bps"),
    ]
    for idx, (panel_id, title, expr, unit) in enumerate(hottest):
        panels.append(
            stat_panel(
                panel_id,
                title,
                f"{expr}",
                {"h": 3, "w": 6, "x": 6 * idx, "y": 5},
                unit=unit,
                thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                text_mode="name_and_value",
                legend="{{node}}",
                instant=True,
                links=link_to("atlas-nodes"),
            )
        )
    # Row 3: shared filesystem usage/free tiles.
    storage_panels = [
        (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
        (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"),
        (25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"),
        (26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"),
    ]
    for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
        panels.append(
            stat_panel(
                panel_id,
                title,
                expr,
                {"h": 6, "w": 6, "x": 6 * idx, "y": 10},
                unit=unit,
                thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                links=link_to("atlas-storage"),
            )
        )
    # Namespace share pie charts (CPU / GPU / RAM).
    panels.append(
        pie_panel(
            11,
            "Namespace CPU Share",
            namespace_cpu_share_expr(),
            {"h": 9, "w": 8, "x": 0, "y": 16},
        )
    )
    panels.append(
        pie_panel(
            12,
            "Namespace GPU Share",
            namespace_gpu_share_expr(),
            {"h": 9, "w": 8, "x": 8, "y": 16},
        )
    )
    panels.append(
        pie_panel(
            13,
            "Namespace RAM Share",
            namespace_ram_share_expr(),
            {"h": 9, "w": 8, "x": 16, "y": 16},
        )
    )
    worker_filter = f"{WORKER_REGEX}"
    # Per-node CPU/RAM time series for workers and control plane.
    panels.append(
        timeseries_panel(
            14,
            "Worker Node CPU",
            node_cpu_expr(worker_filter),
            {"h": 8, "w": 12, "x": 0, "y": 32},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            links=link_to("atlas-nodes"),
        )
    )
    panels.append(
        timeseries_panel(
            15,
            "Worker Node RAM",
            node_mem_expr(worker_filter),
            {"h": 8, "w": 12, "x": 12, "y": 32},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            links=link_to("atlas-nodes"),
        )
    )
    panels.append(
        timeseries_panel(
            16,
            "Control plane CPU",
            node_cpu_expr(CONTROL_REGEX),
            {"h": 7, "w": 12, "x": 0, "y": 40},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            17,
            "Control plane RAM",
            node_mem_expr(CONTROL_REGEX),
            {"h": 7, "w": 12, "x": 12, "y": 40},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    # Network throughput row (ingress / egress / intra-cluster).
    panels.append(
        timeseries_panel(
            18,
            "Cluster Ingress Throughput",
            NET_INGRESS_EXPR,
            {"h": 7, "w": 8, "x": 0, "y": 25},
            unit="Bps",
            legend="Ingress (Traefik)",
            legend_display="list",
            legend_placement="bottom",
            links=link_to("atlas-network"),
        )
    )
    panels.append(
        timeseries_panel(
            19,
            "Cluster Egress Throughput",
            NET_EGRESS_EXPR,
            {"h": 7, "w": 8, "x": 8, "y": 25},
            unit="Bps",
            legend="Egress (Traefik)",
            legend_display="list",
            legend_placement="bottom",
            links=link_to("atlas-network"),
        )
    )
    panels.append(
        timeseries_panel(
            20,
            "Intra-Cluster Throughput",
            NET_INTERNAL_EXPR,
            {"h": 7, "w": 8, "x": 16, "y": 25},
            unit="Bps",
            legend="Internal traffic",
            legend_display="list",
            legend_placement="bottom",
            links=link_to("atlas-network"),
        )
    )
    # Root disk usage: 30-day trend plus top-8 fullest nodes.
    panels.append(
        timeseries_panel(
            21,
            "Root Filesystem Usage",
            root_usage_expr(),
            {"h": 8, "w": 12, "x": 0, "y": 47},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            time_from="30d",
            links=link_to("atlas-storage"),
        )
    )
    panels.append(
        bargauge_panel(
            22,
            "Nodes Closest to Full Root Disks",
            f"topk(8, {root_usage_expr()})",
            {"h": 8, "w": 12, "x": 12, "y": 47},
            unit="percent",
            links=link_to("atlas-storage"),
        )
    )
    return {
        "uid": "atlas-overview",
        "title": "Atlas Overview",
        "folderUid": PUBLIC_FOLDER,
        "editable": False,
        "annotations": {"list": []},
        "panels": panels,
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "overview"],
        "templating": {"list": []},
        "time": {"from": "now-1h", "to": "now"},
        "refresh": "1m",
        "links": [
            {"title": "Atlas Pods", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
            {"title": "Atlas Nodes", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
            {"title": "Atlas Storage", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False},
            {"title": "Atlas Network", "type": "dashboard", "dashboardUid": "atlas-network", "keepTime": False},
            {"title": "Atlas GPU", "type": "dashboard", "dashboardUid": "atlas-gpu", "keepTime": False},
        ],
    }
def build_pods_dashboard():
    """Build the internal Atlas Pods dashboard: stat tiles for problem /
    crash-looping / stuck-terminating pods, plus detail tables for each."""
    panels = []
    panels.append(
        stat_panel(
            1,
            "Problem Pods",
            PROBLEM_PODS_EXPR,
            {"h": 4, "w": 6, "x": 0, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            2,
            "CrashLoop / ImagePull",
            CRASHLOOP_EXPR,
            {"h": 4, "w": 6, "x": 6, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            3,
            "Stuck Terminating (>10m)",
            STUCK_TERMINATING_EXPR,
            {"h": 4, "w": 6, "x": 12, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )
    panels.append(
        stat_panel(
            4,
            "Control Plane Workloads",
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
            {"h": 4, "w": 6, "x": 18, "y": 0},
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "red", "value": 1},
                ],
            },
        )
    )
    # Detail tables: value is pod age in seconds; labels become columns.
    panels.append(
        table_panel(
            5,
            "Pods Not Running",
            PROBLEM_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 4},
            unit="s",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            6,
            "CrashLoop / ImagePull",
            CRASHLOOP_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 14},
            unit="s",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            7,
            "Terminating >10m",
            STUCK_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 24},
            unit="s",
            # The query returns all terminating pods; filter to >10 minutes here.
            transformations=[
                {"id": "labelsToFields", "options": {}},
                {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
            ],
        )
    )
    return {
        "uid": "atlas-pods",
        "title": "Atlas Pods",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "pods"],
    }
def build_nodes_dashboard():
    """Build the internal Atlas Nodes dashboard: readiness tiles plus
    per-node CPU/RAM and root-disk time series."""
    panels = []
    panels.append(
        stat_panel(
            1,
            "Worker Nodes Ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
            {"h": 4, "w": 8, "x": 0, "y": 0},
            value_suffix=WORKER_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            2,
            "Control Plane Ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
            {"h": 4, "w": 8, "x": 8, "y": 0},
            value_suffix=CONTROL_SUFFIX,
        )
    )
    panels.append(
        stat_panel(
            3,
            "Control Plane Workloads",
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
            {"h": 4, "w": 8, "x": 16, "y": 0},
        )
    )
    # All-node CPU/RAM series (no scope regex).
    panels.append(
        timeseries_panel(
            4,
            "Node CPU",
            node_cpu_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 4},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            5,
            "Node RAM",
            node_mem_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 13},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    # Control plane including the titan-db dependency node.
    panels.append(
        timeseries_panel(
            6,
            "Control Plane (incl. titan-db) CPU",
            node_cpu_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 0, "y": 22},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            7,
            "Control Plane (incl. titan-db) RAM",
            node_mem_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 12, "y": 22},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            8,
            "Root Filesystem Usage",
            root_usage_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 31},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    return {
        "uid": "atlas-nodes",
        "title": "Atlas Nodes",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "nodes"],
    }
def build_storage_dashboard():
    """Build the internal Atlas Storage dashboard: shared-filesystem usage
    tiles, per-node breakdowns, and 90-day usage history."""
    panels = []
    panels.append(
        stat_panel(
            1,
            "Astreae Usage",
            astreae_usage_expr("/mnt/astreae"),
            {"h": 5, "w": 6, "x": 0, "y": 0},
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
        )
    )
    panels.append(
        stat_panel(
            2,
            "Asteria Usage",
            astreae_usage_expr("/mnt/asteria"),
            {"h": 5, "w": 6, "x": 6, "y": 0},
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
        )
    )
    panels.append(
        stat_panel(
            3,
            "Astreae Free",
            astreae_free_expr("/mnt/astreae"),
            {"h": 5, "w": 6, "x": 12, "y": 0},
            unit="decbytes",
        )
    )
    panels.append(
        stat_panel(
            4,
            "Asteria Free",
            astreae_free_expr("/mnt/asteria"),
            {"h": 5, "w": 6, "x": 18, "y": 0},
            unit="decbytes",
        )
    )
    # Per-node breakdown, scoped to the storage-backing nodes.
    panels.append(
        timeseries_panel(
            5,
            "Astreae Per-Node Usage",
            filesystem_usage_expr("/mnt/astreae", LONGHORN_NODE_REGEX),
            {"h": 9, "w": 12, "x": 0, "y": 5},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    panels.append(
        timeseries_panel(
            6,
            "Asteria Per-Node Usage",
            filesystem_usage_expr("/mnt/asteria", LONGHORN_NODE_REGEX),
            {"h": 9, "w": 12, "x": 12, "y": 5},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    # Long-range (90d) aggregate usage history.
    panels.append(
        timeseries_panel(
            7,
            "Astreae Usage History",
            astreae_usage_expr("/mnt/astreae"),
            {"h": 9, "w": 12, "x": 0, "y": 14},
            unit="percent",
            time_from="90d",
        )
    )
    panels.append(
        timeseries_panel(
            8,
            "Asteria Usage History",
            astreae_usage_expr("/mnt/asteria"),
            {"h": 9, "w": 12, "x": 12, "y": 14},
            unit="percent",
            time_from="90d",
        )
    )
    return {
        "uid": "atlas-storage",
        "title": "Atlas Storage",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "storage"],
    }
def build_network_dashboard():
    """Build the internal Atlas Network dashboard: traffic tiles, per-node
    throughput, top talkers, and Traefik router/entrypoint rates."""
    panels = []
    panels.append(
        stat_panel(
            1,
            "Ingress Traffic",
            NET_INGRESS_EXPR,
            {"h": 4, "w": 8, "x": 0, "y": 0},
            unit="Bps",
        )
    )
    panels.append(
        stat_panel(
            2,
            "Egress Traffic",
            NET_EGRESS_EXPR,
            {"h": 4, "w": 8, "x": 8, "y": 0},
            unit="Bps",
        )
    )
    panels.append(
        stat_panel(
            3,
            "Intra-Cluster Traffic",
            NET_INTERNAL_EXPR,
            {"h": 4, "w": 8, "x": 16, "y": 0},
            unit="Bps",
        )
    )
    panels.append(
        stat_panel(
            4,
            "Top Router req/s",
            f"topk(1, {TRAEFIK_ROUTER_EXPR})",
            {"h": 4, "w": 8, "x": 0, "y": 4},
            unit="req/s",
            legend="{{router}}",
        )
    )
    # Physical NIC throughput (rx+tx) joined to node names.
    panels.append(
        timeseries_panel(
            5,
            "Per-Node Throughput",
            f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
            {"h": 8, "w": 24, "x": 0, "y": 8},
            unit="Bps",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    # Top talkers by container network rate.
    panels.append(
        table_panel(
            6,
            "Top Namespaces",
            'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
            '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))',
            {"h": 9, "w": 12, "x": 0, "y": 16},
            unit="Bps",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            7,
            "Top Pods",
            'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
            '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))',
            {"h": 9, "w": 12, "x": 12, "y": 16},
            unit="Bps",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        timeseries_panel(
            8,
            "Traefik Routers (req/s)",
            f"topk(10, {TRAEFIK_ROUTER_EXPR})",
            {"h": 9, "w": 12, "x": 0, "y": 25},
            unit="req/s",
            legend="{{router}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            9,
            "Traefik Entrypoints (req/s)",
            'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
            {"h": 9, "w": 12, "x": 12, "y": 25},
            unit="req/s",
            legend="{{entrypoint}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    return {
        "uid": "atlas-network",
        "title": "Atlas Network",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "network"],
    }
def build_gpu_dashboard():
    """Build the internal Atlas GPU dashboard from DCGM utilisation metrics:
    namespace share pie, per-namespace and per-node series, top pods table."""
    panels = [
        pie_panel(
            1,
            "Namespace GPU Share",
            namespace_gpu_share_expr(),
            {"h": 8, "w": 12, "x": 0, "y": 0},
        ),
        timeseries_panel(
            2,
            "GPU Util by Namespace",
            NAMESPACE_GPU_USAGE_INSTANT,
            {"h": 8, "w": 12, "x": 12, "y": 0},
            unit="percent",
            legend="{{namespace}}",
            legend_display="table",
            legend_placement="right",
        ),
        timeseries_panel(
            3,
            "GPU Util by Node",
            'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
            {"h": 8, "w": 12, "x": 0, "y": 8},
            unit="percent",
            legend="{{Hostname}}",
            legend_display="table",
            legend_placement="right",
        ),
        table_panel(
            4,
            "Top Pods by GPU Util",
            'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
            {"h": 8, "w": 12, "x": 12, "y": 8},
            unit="percent",
            transformations=[{"id": "labelsToFields", "options": {}}],
        ),
    ]
    return {
        "uid": "atlas-gpu",
        "title": "Atlas GPU",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "gpu"],
    }
# Registry mapping dashboard uid -> builder function and ConfigMap output path.
DASHBOARDS = {
    "atlas-overview": {
        "builder": build_overview,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-overview.yaml",
    },
    "atlas-pods": {
        "builder": build_pods_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-pods.yaml",
    },
    "atlas-nodes": {
        "builder": build_nodes_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-nodes.yaml",
    },
    "atlas-storage": {
        "builder": build_storage_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-storage.yaml",
    },
    "atlas-network": {
        "builder": build_network_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-network.yaml",
    },
    "atlas-gpu": {
        "builder": build_gpu_dashboard,
        "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",
    },
}
def write_json(uid, data):
    """Serialise *data* to DASHBOARD_DIR/<uid>.json, creating the directory."""
    DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
    target = DASHBOARD_DIR / f"{uid}.json"
    target.write_text(json.dumps(data, indent=2) + "\n")
def render_configmap(uid, info):
    """Render DASHBOARD_DIR/<uid>.json into the ConfigMap at info["configmap"].

    The JSON is parsed and re-dumped (normalising formatting), then every
    line is indented so it nests under the "<file>.json: |" block scalar of
    CONFIG_TEMPLATE.
    """
    json_path = DASHBOARD_DIR / f"{uid}.json"
    payload = json.dumps(json.loads(json_path.read_text()), indent=2)
    # Four spaces, not one: the data key sits under "data:" with its own
    # indent, and a YAML block scalar's body must be indented deeper than
    # its key for the manifest to parse.
    indented = "\n".join("    " + line for line in payload.splitlines())
    output_path = info["configmap"]
    content = CONFIG_TEMPLATE.format(
        relative_path=output_path.relative_to(ROOT),
        name=output_path.stem,
        key=json_path.name,
        payload=indented,
    )
    output_path.write_text(content)
    print(f"Rendered {json_path.name} -> {output_path.relative_to(ROOT)}")
def main():
    """CLI entry point: optionally rebuild dashboard JSON, then always
    re-render every ConfigMap from the JSON on disk."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--build",
        action="store_true",
        help="Regenerate dashboard JSON files from builders",
    )
    args = parser.parse_args()
    if args.build:
        for uid, info in DASHBOARDS.items():
            write_json(uid, info["builder"]())
    for uid, info in DASHBOARDS.items():
        render_configmap(uid, info)


if __name__ == "__main__":
    main()