titan-iac/scripts/render_dashboards.py

606 lines
22 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""Generate Grafana dashboards and render them into ConfigMaps.
Usage:
python scripts/render_dashboards.py --build # rebuild JSON + ConfigMaps
python scripts/render_dashboards.py # just render ConfigMaps
"""
import argparse
import json
import textwrap
from pathlib import Path
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]

# Where the generated dashboard JSON files are written and read back from.
DASHBOARD_DIR = ROOT.joinpath("services", "monitoring", "dashboards")

# Text skeleton of the ConfigMap manifest; filled in with str.format() using
# the placeholders relative_path, name, key and payload.
# NOTE(review): every line of this template starts at column 0, so "name",
# "labels" and the data key are not nested under "metadata:" / "data:".
# Confirm the YAML indentation was not lost before relying on the output.
CONFIG_TEMPLATE = textwrap.dedent(
    "# {relative_path}\n"
    "apiVersion: v1\n"
    "kind: ConfigMap\n"
    "metadata:\n"
    "name: {name}\n"
    "labels:\n"
    'grafana_dashboard: "1"\n'
    "data:\n"
    "{key}: |\n"
    "{payload}\n"
)

# Datasource reference attached to every query-backed panel.
PROM_DS = dict(type="prometheus", uid="atlas-vm")
# --------------------------------------------------------------------------- #
# Panel helper factories
# --------------------------------------------------------------------------- #
def stat_panel(panel_id, title, expr, grid, *, unit="none", thresholds=None,
               text_mode="value", legend=None):
    """Return the JSON model of a Grafana ``stat`` panel that queries PROM_DS.

    Args:
        panel_id: Stored under the panel's ``id`` key.
        title: Panel title.
        expr: Query expression for the single target (refId ``A``).
        grid: ``gridPos`` mapping, stored as given.
        unit: Field unit; defaults to ``"none"``.
        thresholds: Threshold config. When falsy, an absolute scheme with a
            grey base step and a green step at 1 is used.
        text_mode: Stored under the ``textMode`` option.
        legend: ``legendFormat`` for the target; left out when falsy.

    Returns:
        dict: The panel definition.
    """
    if not thresholds:
        thresholds = {
            "mode": "absolute",
            "steps": [
                {"color": "rgba(115, 115, 115, 1)", "value": None},
                {"color": "green", "value": 1},
            ],
        }

    query = {"expr": expr, "refId": "A"}
    if legend:
        query["legendFormat"] = legend

    field_defaults = {
        "color": {"mode": "palette-classic"},
        "mappings": [],
        "thresholds": thresholds,
        "unit": unit,
    }
    display = {
        "colorMode": "value",
        "graphMode": "area",
        "justifyMode": "center",
        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
        "textMode": text_mode,
    }
    return {
        "id": panel_id,
        "type": "stat",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {"defaults": field_defaults, "overrides": []},
        "options": display,
    }
def timeseries_panel(panel_id, title, expr, grid, *, unit="none", legend=None,
                     legend_display="table", legend_placement="bottom",
                     legend_calcs=None, time_from=None):
    """Return the JSON model of a Grafana ``timeseries`` panel on PROM_DS.

    Args:
        panel_id: Stored under the panel's ``id`` key.
        title: Panel title.
        expr: Query expression for the single target (refId ``A``).
        grid: ``gridPos`` mapping, stored as given.
        unit: Field unit; defaults to ``"none"``.
        legend: ``legendFormat`` for the target; left out when falsy.
        legend_display: Legend ``displayMode``.
        legend_placement: Legend ``placement``.
        legend_calcs: Legend ``calcs`` list; left out when falsy.
        time_from: Panel-level ``timeFrom`` override; left out when falsy.

    Returns:
        dict: The panel definition.
    """
    query = {"expr": expr, "refId": "A"}
    if legend:
        query["legendFormat"] = legend

    legend_options = {"displayMode": legend_display, "placement": legend_placement}
    if legend_calcs:
        legend_options["calcs"] = legend_calcs

    panel = {
        "id": panel_id,
        "type": "timeseries",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
        "options": {"legend": legend_options, "tooltip": {"mode": "multi"}},
    }
    if time_from:
        panel["timeFrom"] = time_from
    return panel
def table_panel(panel_id, title, expr, grid, *, unit="none", transformations=None,
                description=None):
    """Return the JSON model of a Grafana ``table`` panel on PROM_DS.

    Args:
        panel_id: Stored under the panel's ``id`` key.
        title: Panel title.
        expr: Query expression for the single target (refId ``A``).
        grid: ``gridPos`` mapping, stored as given.
        unit: Field unit; defaults to ``"none"``.
        transformations: Transformation list; the key is left out when falsy.
        description: Panel description; the key is left out when falsy.

    Returns:
        dict: The panel definition.
    """
    panel = dict(
        id=panel_id,
        type="table",
        title=title,
        datasource=PROM_DS,
        gridPos=grid,
        targets=[{"expr": expr, "refId": "A"}],
        fieldConfig={"defaults": {"unit": unit}, "overrides": []},
        options={"showHeader": True},
    )
    # Optional keys are appended in a fixed order so the emitted JSON is stable.
    for key, value in (("transformations", transformations), ("description", description)):
        if value:
            panel[key] = value
    return panel
def pie_panel(panel_id, title, expr, grid):
    """Return the JSON model of a Grafana ``piechart`` panel on PROM_DS.

    The single target is labelled with ``{{namespace}}`` and the field unit is
    fixed to ``percent``.

    Args:
        panel_id: Stored under the panel's ``id`` key.
        title: Panel title.
        expr: Query expression for the target (refId ``A``).
        grid: ``gridPos`` mapping, stored as given.

    Returns:
        dict: The panel definition.
    """
    query = {"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}
    reduce_options = {"calcs": ["lastNotNull"], "fields": "", "values": False}
    chart_options = {
        "legend": {"displayMode": "list", "placement": "right"},
        "pieType": "pie",
        "reduceOptions": reduce_options,
    }
    panel = {"id": panel_id, "type": "piechart", "title": title}
    panel["datasource"] = PROM_DS
    panel["gridPos"] = grid
    panel["targets"] = [query]
    panel["fieldConfig"] = {"defaults": {"unit": "percent"}, "overrides": []}
    panel["options"] = chart_options
    return panel
def text_panel(panel_id, title, content, grid):
    """Return the JSON model of a markdown ``text`` panel.

    Args:
        panel_id: Stored under the panel's ``id`` key.
        title: Panel title.
        content: Markdown source placed in the panel options.
        grid: ``gridPos`` mapping, stored as given.

    Returns:
        dict: The panel definition; its ``datasource`` is ``None``.
    """
    markdown = {"mode": "markdown", "content": content}
    return dict(
        id=panel_id,
        type="text",
        title=title,
        gridPos=grid,
        datasource=None,
        options=markdown,
    )
def node_cpu_expr(scope=""):
    """Return the per-node CPU usage (percent) query expression.

    The base query derives an ``internal_ip`` label from ``instance`` and
    joins it with ``kube_node_info`` to obtain the ``node`` label.

    Args:
        scope: Optional regex of node names. When truthy, a join against
            ``kube_node_info{node=~"<scope>"}`` is appended to the query.

    Returns:
        str: The query expression.
    """
    base = (
        'avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]), '
        '"internal_ip", "$1", "instance", "([^:]+):.*")) * 100) '
        '* on (internal_ip) group_left(node) kube_node_info))'
    )
    if not scope:
        return base
    return base + ' * on(node) group_left() kube_node_info{node=~"' + f"{scope}" + '"}'
def node_mem_expr(scope=""):
    """Return the per-node memory usage (percent) query expression.

    Usage is computed as (MemTotal - MemAvailable) / MemTotal, then joined
    with ``kube_node_info`` through a derived ``internal_ip`` label.

    Args:
        scope: Optional regex of node names. When truthy, a join against
            ``kube_node_info{node=~"<scope>"}`` is appended to the query.

    Returns:
        str: The query expression.
    """
    base = (
        'avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) '
        '/ node_memory_MemTotal_bytes, "internal_ip", "$1", "instance", "([^:]+):.*") * 100) '
        '* on (internal_ip) group_left(node) kube_node_info))'
    )
    if not scope:
        return base
    return base + ' * on(node) group_left() kube_node_info{node=~"' + f"{scope}" + '"}'
def root_usage_expr():
    """Return the per-node root filesystem usage (percent) query expression.

    Only the ``/`` mountpoint is considered, excluding tmpfs and overlay
    filesystems; the result is joined with ``kube_node_info`` through a
    derived ``internal_ip`` label.

    Returns:
        str: The query expression.
    """
    root_fs = '{mountpoint="/",fstype!~"tmpfs|overlay"}'
    free_ratio = f"node_filesystem_avail_bytes{root_fs} / node_filesystem_size_bytes{root_fs}"
    by_ip = f'label_replace({free_ratio}, "internal_ip", "$1", "instance", "([^:]+):.*")'
    return (
        f"avg by (node) (((1 - ({by_ip})) * 100) "
        "* on (internal_ip) group_left(node) kube_node_info)"
    )
def astreae_usage_expr(mount):
    """Return the aggregate usage (percent) query for one mountpoint.

    Available and total bytes are each summed over every matching series
    (tmpfs and overlay excluded) before the ratio is taken.

    Args:
        mount: Mountpoint path placed in the ``mountpoint`` label matcher.

    Returns:
        str: The query expression.
    """
    selector = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
    available = f"sum(node_filesystem_avail_bytes{selector})"
    capacity = f"sum(node_filesystem_size_bytes{selector})"
    return f"100 - ({available} / {capacity} * 100)"
def astreae_free_expr(mount):
    """Return the query for total available bytes on one mountpoint.

    Args:
        mount: Mountpoint path placed in the ``mountpoint`` label matcher.

    Returns:
        str: The query expression (tmpfs and overlay filesystems excluded).
    """
    template = 'sum(node_filesystem_avail_bytes{{mountpoint="{}",fstype!~"tmpfs|overlay"}})'
    return template.format(mount)
def build_overview():
    """Assemble the "Atlas Overview" dashboard model.

    Panels are laid out top to bottom: a row of eight headline stats,
    namespace CPU/RAM pie charts, cluster node CPU/RAM graphs, problem and
    terminating pod tables, control-plane CPU/RAM graphs, root filesystem
    usage (graph plus bar gauge), usage/free stats and per-node tables for
    the /mnt/astreae and /mnt/asteria mounts, and a closing markdown note.

    Returns:
        dict: Dashboard JSON model with uid ``atlas-overview``.
    """
    # NOTE(review): "percentage" thresholds are evaluated relative to the
    # field's min/max, which stat_panel() does not set. Confirm that absolute
    # 70/85 thresholds were not intended for the stat panels using this.
    thresholds_percent = {
        "mode": "percentage",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 70},
            {"color": "red", "value": 85},
        ],
    }
    hottest_node = {
        "unit": "percent",
        "thresholds": thresholds_percent,
        "text_mode": "value_and_name",
        "legend": "{{node}}",
    }

    # (panel id, title, query, extra stat_panel keyword arguments)
    headline_stats = [
        (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', {}),
        (2, "Ready nodes", 'sum(kube_node_status_condition{condition="Ready",status="true"})', {}),
        (
            3,
            "Control plane ready",
            'sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})',
            {},
        ),
        # Without `bool`, a comparison only filters series and keeps their
        # original sample values, so summing `metric == 0` is always 0.
        # `== bool 0` yields 1 per schedulable node, which sum() then counts.
        (
            4,
            "Control plane schedulable",
            'sum(kube_node_spec_unschedulable{node=~"titan-0a|titan-0b|titan-0c"} == bool 0)',
            {},
        ),
        (5, "Problem pods", 'sum(kube_pod_status_phase{phase!~"Running|Succeeded"})', {}),
        # Same reasoning: `> bool 600` counts pods instead of adding up their
        # time-in-termination in seconds.
        (6, "Stuck terminating", 'sum(((time() - kube_pod_deletion_timestamp) > bool 600))', {}),
        (7, "Hottest node: CPU", node_cpu_expr(), hottest_node),
        (8, "Hottest node: RAM", node_mem_expr(), hottest_node),
    ]
    # The Grafana grid is 24 columns wide, so all eight headline stats share
    # the first row at equal width instead of running past column 24.
    stat_width = 24 // len(headline_stats)
    panels = [
        stat_panel(
            panel_id,
            title,
            expr,
            {"h": 5, "w": stat_width, "x": stat_width * column, "y": 0},
            **extra,
        )
        for column, (panel_id, title, expr, extra) in enumerate(headline_stats)
    ]

    panels.append(
        pie_panel(
            9,
            "Namespace CPU share",
            'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace))',
            {"h": 9, "w": 12, "x": 0, "y": 5},
        )
    )
    panels.append(
        pie_panel(
            10,
            "Namespace RAM share",
            'topk(10, sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace))',
            {"h": 9, "w": 12, "x": 12, "y": 5},
        )
    )
    panels.append(
        timeseries_panel(
            11,
            "Cluster node CPU",
            node_cpu_expr(),
            {"h": 8, "w": 12, "x": 0, "y": 14},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            12,
            "Cluster node RAM",
            node_mem_expr(),
            {"h": 8, "w": 12, "x": 12, "y": 14},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        table_panel(
            13,
            "Problem pods (details)",
            # kube_pod_status_phase carries one 0/1 series per phase for each
            # pod. Keeping only the active phase (> 0) leaves at most one
            # right-hand series per pod for the group_left join and drops
            # pods that are not in a problem phase.
            '(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info '
            '* on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) '
            '(kube_pod_status_phase{phase!~"Running|Succeeded"} > 0)',
            {"h": 8, "w": 12, "x": 0, "y": 22},
            unit="s",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            14,
            "Terminating >10m",
            "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
            {"h": 8, "w": 12, "x": 12, "y": 22},
            unit="s",
            transformations=[
                {"id": "labelsToFields", "options": {}},
                {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
            ],
        )
    )
    panels.append(
        timeseries_panel(
            15,
            "Control plane CPU",
            node_cpu_expr("titan-0a|titan-0b|titan-0c"),
            {"h": 7, "w": 12, "x": 0, "y": 30},
            unit="percent",
            legend="{{node}}",
        )
    )
    panels.append(
        timeseries_panel(
            16,
            "Control plane RAM",
            node_mem_expr("titan-0a|titan-0b|titan-0c"),
            {"h": 7, "w": 12, "x": 12, "y": 30},
            unit="percent",
            legend="{{node}}",
        )
    )
    panels.append(
        timeseries_panel(
            17,
            "Root filesystem usage",
            root_usage_expr(),
            {"h": 8, "w": 12, "x": 0, "y": 37},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            time_from="7d",
        )
    )
    # No helper exists for bar gauges, so this panel is spelled out inline.
    panels.append(
        {
            "id": 18,
            "type": "bargauge",
            "title": "Nodes closest to full root disks",
            "datasource": PROM_DS,
            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 37},
            "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}],
            "fieldConfig": {
                "defaults": {
                    "unit": "percent",
                    "min": 0,
                    "max": 100,
                    "thresholds": {
                        "mode": "percentage",
                        "steps": [
                            {"color": "green", "value": None},
                            {"color": "yellow", "value": 50},
                            {"color": "orange", "value": 70},
                            {"color": "red", "value": 85},
                        ],
                    },
                },
                "overrides": [],
            },
            "options": {
                "displayMode": "gradient",
                "orientation": "horizontal",
                "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            },
        }
    )
    panels.append(
        stat_panel(
            19,
            "Astreae usage",
            astreae_usage_expr("/mnt/astreae"),
            {"h": 6, "w": 6, "x": 0, "y": 45},
            unit="percent",
            thresholds=thresholds_percent,
        )
    )
    panels.append(
        stat_panel(
            20,
            "Asteria usage",
            astreae_usage_expr("/mnt/asteria"),
            {"h": 6, "w": 6, "x": 6, "y": 45},
            unit="percent",
            thresholds=thresholds_percent,
        )
    )
    panels.append(
        stat_panel(
            21,
            "Astreae free",
            astreae_free_expr("/mnt/astreae"),
            {"h": 6, "w": 6, "x": 12, "y": 45},
            unit="bytesSI",
        )
    )
    panels.append(
        stat_panel(
            22,
            "Asteria free",
            astreae_free_expr("/mnt/asteria"),
            {"h": 6, "w": 6, "x": 18, "y": 45},
            unit="bytesSI",
        )
    )
    panels.append(
        table_panel(
            23,
            "Astreae per-node usage",
            '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} * 100)',
            {"h": 8, "w": 12, "x": 0, "y": 51},
            unit="percent",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            24,
            "Asteria per-node usage",
            '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} * 100)',
            {"h": 8, "w": 12, "x": 12, "y": 51},
            unit="percent",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        text_panel(
            25,
            "About this dashboard",
            "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders",
            {"h": 5, "w": 24, "x": 0, "y": 59},
        )
    )

    return {
        "uid": "atlas-overview",
        "title": "Atlas Overview",
        "annotations": {
            "list": [
                {
                    "builtIn": 1,
                    "datasource": {"type": "datasource", "uid": "grafana"},
                    "enable": True,
                    "hide": True,
                    "iconColor": "rgba(0, 211, 255, 1)",
                    "name": "Annotations & Alerts",
                    "type": "dashboard",
                }
            ]
        },
        "editable": False,
        "folderUid": "atlas-overview",
        "graphTooltip": 0,
        "links": [
            {"title": "Pods dashboard", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
            {"title": "Nodes dashboard", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
            {"title": "Storage dashboard", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False},
        ],
        "panels": panels,
        "refresh": "30s",
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "overview"],
        "templating": {"list": []},
        "time": {"from": "now-12h", "to": "now"},
    }
def build_pods_dashboard():
    """Assemble the "Atlas Pods" dashboard model.

    The dashboard stacks three full-width tables: pods that are not running,
    pods waiting in CrashLoopBackOff/ImagePullBackOff, and pods that have
    been terminating for more than 600 seconds. Every table reports an age
    in seconds and expands series labels into columns.

    Returns:
        dict: Dashboard JSON model with uid ``atlas-pods``.
    """
    panels = [
        table_panel(
            1,
            "Pods not running",
            # kube_pod_status_phase carries one 0/1 series per phase for each
            # pod. Keeping only the active phase (> 0) leaves at most one
            # right-hand series per pod for the group_left join and drops
            # pods that are not in a problem phase.
            '(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info '
            '* on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) '
            '(kube_pod_status_phase{phase!~"Running|Succeeded"} > 0)',
            {"h": 10, "w": 24, "x": 0, "y": 0},
            unit="s",
            transformations=[{"id": "labelsToFields", "options": {}}],
        ),
        table_panel(
            2,
            "CrashLoop / ImagePull",
            # NOTE(review): the join lists `container` in on(...), but the
            # right-hand aggregation only keeps namespace, pod and reason.
            # Confirm the match set is what was intended.
            '(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info '
            '* on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) '
            '(kube_pod_container_status_waiting_reason{reason=~"CrashLoopBackOff|ImagePullBackOff"})',
            {"h": 10, "w": 24, "x": 0, "y": 10},
            unit="s",
            transformations=[{"id": "labelsToFields", "options": {}}],
        ),
        table_panel(
            3,
            "Terminating pods",
            "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
            {"h": 10, "w": 24, "x": 0, "y": 20},
            unit="s",
            transformations=[
                {"id": "labelsToFields", "options": {}},
                {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
            ],
        ),
    ]
    return {
        "uid": "atlas-pods",
        "title": "Atlas Pods",
        "folderUid": "atlas-pods",
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "pods"],
    }
def build_nodes_dashboard():
    """Assemble the "Atlas Nodes" dashboard model.

    One row of four stats (node count, ready nodes, control-plane CPU and
    RAM) sits above three full-width graphs: node CPU, node RAM and root
    filesystem usage (the last one over a 7d window).

    Returns:
        dict: Dashboard JSON model with uid ``atlas-nodes``.
    """
    control_plane = "titan-0a|titan-0b|titan-0c"
    ready_nodes = 'sum(kube_node_status_condition{condition="Ready",status="true"})'
    # Keyword sets shared by several panels.
    named_percent = {"unit": "percent", "legend": "{{node}}", "text_mode": "value_and_name"}
    side_legend = {
        "unit": "percent",
        "legend": "{{node}}",
        "legend_display": "table",
        "legend_placement": "right",
    }

    def stat_slot(column):
        # Four 6-wide stats across the top row.
        return {"h": 5, "w": 6, "x": 6 * column, "y": 0}

    def full_width(row_y):
        # Full-width graph starting at the given grid row.
        return {"h": 9, "w": 24, "x": 0, "y": row_y}

    panels = [
        stat_panel(1, "Node count", "count(kube_node_info)", stat_slot(0)),
        stat_panel(2, "Ready nodes", ready_nodes, stat_slot(1)),
        stat_panel(3, "Control plane CPU avg", node_cpu_expr(control_plane), stat_slot(2), **named_percent),
        stat_panel(4, "Control plane RAM avg", node_mem_expr(control_plane), stat_slot(3), **named_percent),
        timeseries_panel(5, "Node CPU", node_cpu_expr(), full_width(5), legend_calcs=["last"], **side_legend),
        timeseries_panel(6, "Node RAM", node_mem_expr(), full_width(14), legend_calcs=["last"], **side_legend),
        timeseries_panel(7, "Root filesystem", root_usage_expr(), full_width(23), time_from="7d", **side_legend),
    ]
    return {
        "uid": "atlas-nodes",
        "title": "Atlas Nodes",
        "folderUid": "atlas-nodes",
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "nodes"],
    }
def build_storage_dashboard():
    """Assemble the "Atlas Storage" dashboard model.

    For the /mnt/astreae and /mnt/asteria mounts the dashboard shows usage
    and free-space stats on the top row, a full-width root filesystem graph
    over a 30d window, and one per-node usage table per mount.

    Returns:
        dict: Dashboard JSON model with uid ``atlas-storage``.
    """
    volumes = (("Astreae", "/mnt/astreae"), ("Asteria", "/mnt/asteria"))
    panels = []

    # Panels 1-2: usage percentage per mount.
    for slot, (label, mount) in enumerate(volumes):
        panels.append(
            stat_panel(
                1 + slot,
                f"{label} usage",
                astreae_usage_expr(mount),
                {"h": 5, "w": 6, "x": 6 * slot, "y": 0},
                unit="percent",
            )
        )

    # Panels 3-4: free bytes per mount, continuing the same row.
    for slot, (label, mount) in enumerate(volumes):
        panels.append(
            stat_panel(
                3 + slot,
                f"{label} free",
                astreae_free_expr(mount),
                {"h": 5, "w": 6, "x": 12 + 6 * slot, "y": 0},
                unit="bytesSI",
            )
        )

    panels.append(
        timeseries_panel(
            5,
            "Root filesystem",
            root_usage_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 5},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )

    # Panels 6-7: per-series usage tables, side by side.
    for slot, (label, mount) in enumerate(volumes):
        selector = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
        per_node = (
            f"100 - (node_filesystem_avail_bytes{selector} / "
            f"node_filesystem_size_bytes{selector} * 100)"
        )
        panels.append(
            table_panel(
                6 + slot,
                f"{label} nodes",
                per_node,
                {"h": 10, "w": 12, "x": 12 * slot, "y": 14},
                unit="percent",
                transformations=[{"id": "labelsToFields", "options": {}}],
            )
        )

    return {
        "uid": "atlas-storage",
        "title": "Atlas Storage",
        "folderUid": "atlas-storage",
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "storage"],
    }
# Registry of dashboards keyed by uid. Each entry names the builder that
# produces the JSON model and the ConfigMap manifest it is rendered into.
DASHBOARDS = {
    f"atlas-{slug}": {
        "builder": builder,
        "configmap": ROOT / "services" / "monitoring" / f"grafana-dashboard-{slug}.yaml",
    }
    for slug, builder in (
        ("overview", build_overview),
        ("pods", build_pods_dashboard),
        ("nodes", build_nodes_dashboard),
        ("storage", build_storage_dashboard),
    )
}
def write_json(uid: str, data: dict) -> None:
    """Write a dashboard model to ``DASHBOARD_DIR/<uid>.json``.

    The directory is created if missing, and the JSON is pretty-printed with
    a two-space indent followed by a trailing newline.

    Args:
        uid: Dashboard uid, used as the file stem.
        data: JSON-serialisable dashboard model.
    """
    DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
    target = DASHBOARD_DIR.joinpath(f"{uid}.json")
    serialized = json.dumps(data, indent=2)
    target.write_text(f"{serialized}\n")
def render_configmap(uid: str, data: dict) -> None:
    """Render ``DASHBOARD_DIR/<uid>.json`` into its ConfigMap manifest.

    The JSON file is parsed and re-serialised with a two-space indent, each
    line is prefixed for embedding, and the result is substituted into
    CONFIG_TEMPLATE and written to ``data["configmap"]``. A one-line summary
    is printed afterwards.

    Args:
        uid: Dashboard uid; selects the JSON file to read.
        data: Registry entry whose ``"configmap"`` value is the output path.
    """
    source = DASHBOARD_DIR / f"{uid}.json"
    dashboard = json.loads(source.read_text())
    pretty_lines = json.dumps(dashboard, indent=2).splitlines()
    # NOTE(review): payload lines get a single-space prefix; confirm this is
    # the indentation CONFIG_TEMPLATE expects beneath its "|" block scalar.
    indented = "\n".join(f" {line}" for line in pretty_lines)

    destination = data["configmap"]
    shown_path = destination.relative_to(ROOT)
    manifest = CONFIG_TEMPLATE.format(
        relative_path=shown_path,
        name=destination.stem,
        key=source.name,
        payload=indented,
    )
    destination.write_text(manifest)
    print(f"Rendered {source.name} -> {shown_path}")
def main():
    """Command-line entry point.

    With ``--build`` every registered builder is run and its JSON written
    first. The ConfigMap for every registered dashboard is then rendered
    from the JSON files on disk, whether or not ``--build`` was given.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--build",
        action="store_true",
        help="Regenerate dashboard JSON files from builders",
    )
    options = cli.parse_args()

    if options.build:
        for uid, entry in DASHBOARDS.items():
            dashboard = entry["builder"]()
            write_json(uid, dashboard)

    for uid, entry in DASHBOARDS.items():
        render_configmap(uid, entry)


if __name__ == "__main__":
    main()