1144 lines
33 KiB
Python
1144 lines
33 KiB
Python
#!/usr/bin/env python3
"""Generate Atlas Grafana dashboards and render them into ConfigMaps.

Usage:
    scripts/render_dashboards.py --build   # rebuild JSON + ConfigMaps
    scripts/render_dashboards.py           # re-render ConfigMaps from JSON
"""
|
|
|
|
import argparse
|
|
import json
|
|
import textwrap
|
|
from pathlib import Path
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Paths, folders, and shared metadata
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Repository root: two levels up from this file (the script is invoked as
# scripts/render_dashboards.py).
ROOT = Path(__file__).resolve().parents[1]
# Directory that holds the generated dashboard JSON files.
DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards"

# Skeleton of the ConfigMap manifest written for each dashboard.
# Placeholders: {relative_path} (header comment), {name} (metadata.name),
# {key} (file name used as the data key) and {payload} (pre-indented JSON).
# `name`/`labels` must be nested under `metadata` and the key under `data`,
# otherwise the document is not a valid ConfigMap. {payload} stays at column
# zero because the caller supplies the indentation for the block scalar body.
CONFIG_TEMPLATE = textwrap.dedent(
    """\
    # {relative_path}
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: {name}
      labels:
        grafana_dashboard: "1"
    data:
      {key}: |
    {payload}
    """
)
|
|
|
|
# Datasource reference placed on the query panels built in this file.
PROM_DS = dict(type="prometheus", uid="atlas-vm")
# Values used as `folderUid` by the dashboard builders.
PUBLIC_FOLDER = "atlas-overview"
PRIVATE_FOLDER = "atlas-internal"

# Shared green -> yellow (70) -> red (85) threshold set in "percentage" mode.
PERCENT_THRESHOLDS = {
    "mode": "percentage",
    "steps": [
        {"color": colour, "value": lower_bound}
        for colour, lower_bound in (("green", None), ("yellow", 70), ("red", 85))
    ],
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Cluster metadata
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Node inventory used to build the regex selectors and "/N" suffixes below.
CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
CONTROL_DEPENDENCIES = ["titan-db"]
CONTROL_ALL = [*CONTROL_PLANE_NODES, *CONTROL_DEPENDENCIES]
# titan-04 .. titan-19 plus the two out-of-sequence hosts titan-22 and titan-24.
WORKER_NODES = [f"titan-{number:02d}" for number in (*range(4, 20), 22, 24)]

# Alternation patterns interpolated into `=~` label matchers.
CONTROL_REGEX = "|".join(CONTROL_PLANE_NODES)
CONTROL_ALL_REGEX = "|".join(CONTROL_ALL)
WORKER_REGEX = "|".join(WORKER_NODES)

# Expected node counts and the matching "/<total>" value suffixes.
CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
WORKER_TOTAL = len(WORKER_NODES)
CONTROL_SUFFIX = "/" + str(CONTROL_TOTAL)
WORKER_SUFFIX = "/" + str(WORKER_TOTAL)

# Namespaces excluded (via `namespace!~`) from the control-plane workload count.
CP_ALLOWED_NS = "|".join(
    (
        "kube-system",
        "kube-public",
        "kube-node-lease",
        "longhorn-system",
        "monitoring",
    )
)
# Scope regex passed to the per-node storage usage panels.
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# PromQL helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# node_uname_info with its `nodename` label copied into a `node` label, so
# other series can be joined against it to pick up a `node` label.
NODE_INFO = (
    "label_replace("
    'node_uname_info{nodename!=""}, '
    '"node", "$1", "nodename", "(.*)"'
    ")"
)
|
|
|
|
|
|
def node_filter(regex):
    """Build a `node`-labelled node_uname_info selector limited to ``regex``.

    Args:
        regex: Pattern placed in the ``nodename=~"..."`` matcher.

    Returns:
        str: A ``label_replace(...)`` expression that copies ``nodename``
        into a ``node`` label for the matching nodes.
    """
    selector = f'node_uname_info{{nodename=~"{regex}"}}'
    return f'label_replace({selector}, "node", "$1", "nodename", "(.*)")'
|
|
|
|
|
|
def scoped_node_expr(base, scope=""):
    """Join ``base`` with NODE_INFO on ``instance`` and average per ``node``.

    Args:
        base: PromQL expression whose series carry an ``instance`` label.
        scope: Optional node-name regex; when non-empty the result is
            multiplied by ``node_filter(scope)`` on the ``node`` label.

    Returns:
        str: The combined PromQL expression.
    """
    per_node = f"avg by (node) (({base}) * on(instance) group_left(node) {NODE_INFO})"
    if not scope:
        return per_node
    # Multiply by the scoped selector so only the matching nodes are kept.
    return f"({per_node}) * on(node) group_left() {node_filter(scope)}"
|
|
|
|
|
|
def node_cpu_expr(scope=""):
    """Return the per-node CPU busy percentage expression.

    The base is ``(1 - idle) * 100`` where ``idle`` is the 5m idle-mode rate
    of ``node_cpu_seconds_total`` averaged by instance; ``scope`` is passed
    through to ``scoped_node_expr``.
    """
    idle_share = 'avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))'
    return scoped_node_expr("(1 - " + idle_share + ") * 100", scope)
|
|
|
|
|
|
def node_mem_expr(scope=""):
    """Return the per-node memory usage percentage expression.

    Usage is ``(MemTotal - MemAvailable) / MemTotal * 100`` averaged by
    instance; ``scope`` is passed through to ``scoped_node_expr``.
    """
    used_bytes = "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)"
    used_percent = f"avg by (instance) ({used_bytes} / node_memory_MemTotal_bytes * 100)"
    return scoped_node_expr(used_percent, scope)
|
|
|
|
|
|
def filesystem_usage_expr(mount, scope=""):
    """Return the per-node used-space percentage for one mountpoint.

    Args:
        mount: Value for the ``mountpoint`` label matcher.
        scope: Optional node-name regex passed to ``scoped_node_expr``.

    Returns:
        str: PromQL computing ``(1 - avail / size) * 100`` per instance
        (tmpfs and overlay filesystems excluded), joined to node names.
    """
    matcher = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
    used_percent = (
        "avg by (instance) ("
        f"(1 - (node_filesystem_avail_bytes{matcher} "
        f"/ node_filesystem_size_bytes{matcher})) * 100)"
    )
    return scoped_node_expr(used_percent, scope)
|
|
|
|
|
|
def root_usage_expr(scope=""):
    """Return the per-node usage expression for the ``/`` mountpoint."""
    return filesystem_usage_expr(mount="/", scope=scope)
|
|
|
|
|
|
def astreae_usage_expr(mount):
    """Return the aggregate used-space percentage for ``mount``.

    Sums available and total bytes over every series with the given
    mountpoint (tmpfs/overlay excluded) and returns ``100 - avail/size*100``.
    """
    matcher = '{mountpoint="' + f"{mount}" + '",fstype!~"tmpfs|overlay"}'
    available = f"sum(node_filesystem_avail_bytes{matcher})"
    capacity = f"sum(node_filesystem_size_bytes{matcher})"
    return f"100 - ({available} / {capacity} * 100)"
|
|
|
|
|
|
def astreae_free_expr(mount):
    """Return the summed available bytes for ``mount`` (tmpfs/overlay excluded)."""
    matcher = f'mountpoint="{mount}",fstype!~"tmpfs|overlay"'
    return "sum(node_filesystem_avail_bytes{" + matcher + "})"
|
|
|
|
|
|
# Count of pods whose phase series is neither Running nor Succeeded.
PROBLEM_PODS_EXPR = (
    "sum(max by (namespace,pod) "
    '(kube_pod_status_phase{phase!~"Running|Succeeded"}))'
)
# Count of pods with a container waiting on CrashLoopBackOff/ImagePullBackOff.
CRASHLOOP_EXPR = "sum(max by (namespace,pod) ({metric}{{reason=~\"{reasons}\"}}))".format(
    metric="kube_pod_container_status_waiting_reason",
    reasons="CrashLoopBackOff|ImagePullBackOff",
)
# Count of pods whose deletion timestamp is more than 600 seconds old.
STUCK_TERMINATING_EXPR = "".join(
    (
        "sum(max by (namespace,pod) (",
        '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600)',
        ' and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)',
        "))",
    )
)
|
|
|
|
# Table queries: the value is a duration in seconds (pod age, or time since
# deletion was requested), joined with kube_pod_info to add the `node` label.
PROBLEM_TABLE_EXPR = " ".join(
    (
        '(time() - kube_pod_created{pod!=""})',
        "* on(namespace,pod) group_left(node) kube_pod_info",
        "* on(namespace,pod) group_left(phase)",
        'max by (namespace,pod,phase) (kube_pod_status_phase{phase!~"Running|Succeeded"})',
    )
)
CRASHLOOP_TABLE_EXPR = " ".join(
    (
        '(time() - kube_pod_created{pod!=""})',
        "* on(namespace,pod) group_left(node) kube_pod_info",
        "* on(namespace,pod,container) group_left(reason)",
        "max by (namespace,pod,container,reason)",
        '(kube_pod_container_status_waiting_reason{reason=~"CrashLoopBackOff|ImagePullBackOff"})',
    )
)
STUCK_TABLE_EXPR = (
    "("
    + " ".join(
        (
            '((time() - kube_pod_deletion_timestamp{pod!=""})',
            'and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0))',
            "* on(namespace,pod) group_left(node) kube_pod_info",
        )
    )
    + ")"
)
|
|
|
|
# Top ten namespaces by 5m CPU rate and by working-set memory.
NAMESPACE_CPU_EXPR = (
    "topk(10, sum(rate("
    'container_cpu_usage_seconds_total{namespace!="",pod!=""}[5m]'
    ")) by (namespace))"
)
NAMESPACE_RAM_EXPR = (
    "topk(10, sum("
    'container_memory_working_set_bytes{namespace!="",pod!=""}'
    ") by (namespace))"
)

# Receive + transmit byte rate of pod network traffic, averaged per node.
NET_SERIES_EXPR = "avg by (node) ({rx} + {tx})".format(
    rx='rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m])',
    tx='rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m])',
)
NET_TOP_EXPR = "topk(1, " + NET_SERIES_EXPR + ")"

# Disk read + write byte rate, averaged per node.
# NOTE(review): the CPU/RAM/filesystem helpers join node_uname_info to obtain
# a `node` label, while this groups node_disk_* by `node` directly — confirm
# these series actually carry a `node` label.
IO_SERIES_EXPR = (
    "avg by (node) ("
    "rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])"
    ")"
)
IO_TOP_EXPR = "topk(1, " + IO_SERIES_EXPR + ")"

# Request rate per Traefik router.
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"

# Cluster-wide pod traffic; `or on() vector(0)` yields 0 when no series match.
NET_INGRESS_EXPR = " ".join(
    (
        'sum(rate(container_network_receive_bytes_total{namespace!="" ,pod!=""}[5m]))',
        "or on() vector(0)",
    )
)
NET_EGRESS_EXPR = " ".join(
    (
        'sum(rate(container_network_transmit_bytes_total{namespace!="" ,pod!=""}[5m]))',
        "or on() vector(0)",
    )
)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Panel factories
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def stat_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    thresholds=None,
    text_mode="value",
    legend=None,
    display_name=None,
    value_suffix=None,
    links=None,
):
    """Build the JSON model of a Grafana ``stat`` panel.

    Args:
        panel_id: Value for the panel ``id``.
        title: Panel title.
        expr: Query expression for the single target (refId ``A``).
        grid: ``gridPos`` mapping, stored as given.
        unit: Field unit.
        thresholds: Threshold definition; when falsy an absolute
            grey -> green-at-1 pair is used.
        text_mode: Value for ``options.textMode``.
        legend: Optional ``legendFormat`` for the target.
        display_name: Optional ``displayName`` field default.
        value_suffix: Optional ``custom.valueSuffix`` field setting.
        links: Optional panel links list.

    Returns:
        dict: The panel definition.
    """
    fallback_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "rgba(115, 115, 115, 1)", "value": None},
            {"color": "green", "value": 1},
        ],
    }

    custom_settings = {"displayMode": "auto"}
    if value_suffix:
        custom_settings["valueSuffix"] = value_suffix

    field_defaults = {
        "color": {"mode": "palette-classic"},
        "mappings": [],
        "thresholds": thresholds or fallback_thresholds,
        "unit": unit,
        "custom": custom_settings,
    }
    if display_name:
        field_defaults["displayName"] = display_name

    query = {"expr": expr, "refId": "A"}
    if legend:
        query["legendFormat"] = legend

    display_options = {
        "colorMode": "value",
        "graphMode": "area",
        "justifyMode": "center",
        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
        "textMode": text_mode,
    }

    panel = {
        "id": panel_id,
        "type": "stat",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {"defaults": field_defaults, "overrides": []},
        "options": display_options,
    }
    if links:
        panel["links"] = links
    return panel
|
|
|
|
|
|
def timeseries_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    legend=None,
    legend_display="table",
    legend_placement="bottom",
    legend_calcs=None,
    time_from=None,
    links=None,
):
    """Build the JSON model of a Grafana ``timeseries`` panel.

    Args:
        panel_id: Value for the panel ``id``.
        title: Panel title.
        expr: Query expression for the single target (refId ``A``).
        grid: ``gridPos`` mapping, stored as given.
        unit: Field unit.
        legend: Optional ``legendFormat`` for the target.
        legend_display: Legend ``displayMode``.
        legend_placement: Legend ``placement``.
        legend_calcs: Optional list stored as the legend ``calcs``.
        time_from: Optional ``timeFrom`` override for the panel.
        links: Optional panel links list.

    Returns:
        dict: The panel definition.
    """
    query = {"expr": expr, "refId": "A"}
    if legend:
        query["legendFormat"] = legend

    legend_options = {"displayMode": legend_display, "placement": legend_placement}
    if legend_calcs:
        legend_options["calcs"] = legend_calcs

    panel = {
        "id": panel_id,
        "type": "timeseries",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
        "options": {"legend": legend_options, "tooltip": {"mode": "multi"}},
    }
    # Optional top-level keys are only emitted when a truthy value was given.
    for key, value in (("timeFrom", time_from), ("links", links)):
        if value:
            panel[key] = value
    return panel
|
|
|
|
|
|
def table_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    transformations=None,
):
    """Build the JSON model of a Grafana ``table`` panel.

    Args:
        panel_id: Value for the panel ``id``.
        title: Panel title.
        expr: Query expression for the single target (refId ``A``).
        grid: ``gridPos`` mapping, stored as given.
        unit: Field unit.
        transformations: Optional list stored under ``transformations``.

    Returns:
        dict: The panel definition.
    """
    query = {"expr": expr, "refId": "A"}
    panel = {
        "id": panel_id,
        "type": "table",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
        "options": {"showHeader": True},
    }
    if not transformations:
        return panel
    panel["transformations"] = transformations
    return panel
|
|
|
|
|
|
def pie_panel(panel_id, title, expr, grid):
    """Build a ``piechart`` panel whose slices are named by ``{{namespace}}``.

    Args:
        panel_id: Value for the panel ``id``.
        title: Panel title.
        expr: Query expression for the single target (refId ``A``).
        grid: ``gridPos`` mapping, stored as given.

    Returns:
        dict: The panel definition.
    """
    field_config = {
        "defaults": {"unit": "percent", "displayName": "{{namespace}}"},
        "overrides": [],
    }
    chart_options = {
        "legend": {"displayMode": "list", "placement": "right"},
        "pieType": "pie",
        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
    }
    panel = {"id": panel_id, "type": "piechart", "title": title}
    panel["datasource"] = PROM_DS
    panel["gridPos"] = grid
    panel["targets"] = [{"expr": expr, "refId": "A"}]
    panel["fieldConfig"] = field_config
    panel["options"] = chart_options
    return panel
|
|
|
|
|
|
def text_panel(panel_id, title, content, grid):
    """Build a markdown ``text`` panel with no datasource.

    Args:
        panel_id: Value for the panel ``id``.
        title: Panel title.
        content: Markdown body shown in the panel.
        grid: ``gridPos`` mapping, stored as given.

    Returns:
        dict: The panel definition.
    """
    markdown_options = {"mode": "markdown", "content": content}
    panel = {"id": panel_id, "type": "text", "title": title, "gridPos": grid}
    panel["datasource"] = None
    panel["options"] = markdown_options
    return panel
|
|
|
|
|
|
def link_to(uid):
    """Return a one-element panel links list opening ``/d/<uid>`` in a new tab."""
    link = {
        "title": f"Open {uid} dashboard",
        "url": f"/d/{uid}",
        "targetBlank": True,
    }
    return [link]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Dashboard builders
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def build_overview():
    """Assemble the "Atlas Overview" dashboard model (panels 1-25).

    Layout, top to bottom: six health counters, four "hottest node" stats,
    two namespace pie charts, per-node CPU/RAM series, control-plane CPU/RAM
    series, cluster ingress/egress series, root filesystem series and bar
    gauge, four storage stats, and a markdown note.

    Returns:
        dict: Dashboard model stored in ``PUBLIC_FOLDER`` and not editable.
    """

    def ok_when_at_least(count):
        # Red until `count` is reached, green from there on.
        return {
            "mode": "absolute",
            "steps": [
                {"color": "red", "value": None},
                {"color": "green", "value": count},
            ],
        }

    def bad_when_nonzero():
        # Green at zero, red as soon as the value reaches 1.
        return {
            "mode": "absolute",
            "steps": [
                {"color": "green", "value": None},
                {"color": "red", "value": 1},
            ],
        }

    def percent_thresholds_for(unit):
        # Only percentage stats get the shared yellow/red bands.
        return PERCENT_THRESHOLDS if unit == "percent" else None

    def per_node_percent(number, heading, query, grid, **extra):
        # Percent time series with one `{{node}}` row per series in a right-hand table legend.
        return timeseries_panel(
            number,
            heading,
            query,
            grid,
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            **extra,
        )

    def cluster_throughput(number, heading, query, grid):
        # Single-series byte-rate chart linking to the network dashboard.
        return timeseries_panel(
            number,
            heading,
            query,
            grid,
            unit="Bps",
            legend_display="list",
            legend_placement="bottom",
            links=link_to("atlas-network"),
        )

    ready_nodes = 'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{nodes}"}})'
    stray_workloads = f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})'

    panels = []

    # Row at y=0: six counters, four grid columns each.
    health_row = [
        (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})', None, None, None),
        (
            2,
            "Ready nodes",
            ready_nodes.format(nodes=WORKER_REGEX),
            WORKER_SUFFIX,
            ok_when_at_least(WORKER_TOTAL),
            None,
        ),
        (
            3,
            "Control plane ready",
            ready_nodes.format(nodes=CONTROL_REGEX),
            CONTROL_SUFFIX,
            ok_when_at_least(CONTROL_TOTAL),
            None,
        ),
        (4, "Control plane workloads", stray_workloads, None, bad_when_nonzero(), link_to("atlas-pods")),
        (5, "Problem pods", PROBLEM_PODS_EXPR, None, bad_when_nonzero(), link_to("atlas-pods")),
        (6, "Stuck terminating", STUCK_TERMINATING_EXPR, None, bad_when_nonzero(), link_to("atlas-pods")),
    ]
    panels.extend(
        stat_panel(
            number,
            heading,
            query,
            {"h": 5, "w": 4, "x": 4 * column, "y": 0},
            value_suffix=suffix,
            thresholds=steps,
            links=drilldown,
        )
        for column, (number, heading, query, suffix, steps, drilldown) in enumerate(health_row)
    )

    # Row at y=5: the single highest node per resource, six columns each.
    hottest_row = [
        (7, "Hottest node: CPU", f"topk(1, {node_cpu_expr()})", "percent"),
        (8, "Hottest node: RAM", f"topk(1, {node_mem_expr()})", "percent"),
        (9, "Hottest node: NET", NET_TOP_EXPR, "Bps"),
        (10, "Hottest node: I/O", IO_TOP_EXPR, "Bps"),
    ]
    panels.extend(
        stat_panel(
            number,
            heading,
            query,
            {"h": 5, "w": 6, "x": 6 * column, "y": 5},
            unit=unit,
            thresholds=percent_thresholds_for(unit),
            text_mode="value_and_name",
            display_name="{{node}}",
            links=link_to("atlas-nodes"),
        )
        for column, (number, heading, query, unit) in enumerate(hottest_row)
    )

    # Row at y=10: namespace share pies.
    panels += [
        pie_panel(11, "Namespace CPU share", NAMESPACE_CPU_EXPR, {"h": 9, "w": 12, "x": 0, "y": 10}),
        pie_panel(12, "Namespace RAM share", NAMESPACE_RAM_EXPR, {"h": 9, "w": 12, "x": 12, "y": 10}),
    ]

    # Rows at y=19 and y=27: all nodes, then control plane plus titan-db.
    panels += [
        per_node_percent(
            13,
            "Cluster node CPU",
            node_cpu_expr(),
            {"h": 8, "w": 12, "x": 0, "y": 19},
            legend_calcs=["last"],
            links=link_to("atlas-nodes"),
        ),
        per_node_percent(
            14,
            "Cluster node RAM",
            node_mem_expr(),
            {"h": 8, "w": 12, "x": 12, "y": 19},
            legend_calcs=["last"],
            links=link_to("atlas-nodes"),
        ),
        per_node_percent(
            15,
            "Control plane CPU (incl. titan-db)",
            node_cpu_expr(CONTROL_ALL_REGEX),
            {"h": 7, "w": 12, "x": 0, "y": 27},
        ),
        per_node_percent(
            16,
            "Control plane RAM (incl. titan-db)",
            node_mem_expr(CONTROL_ALL_REGEX),
            {"h": 7, "w": 12, "x": 12, "y": 27},
        ),
    ]

    # Row at y=34: cluster-wide network throughput.
    panels += [
        cluster_throughput(17, "Cluster ingress throughput", NET_INGRESS_EXPR, {"h": 7, "w": 12, "x": 0, "y": 34}),
        cluster_throughput(18, "Cluster egress throughput", NET_EGRESS_EXPR, {"h": 7, "w": 12, "x": 12, "y": 34}),
    ]

    # Row at y=41: root filesystem history next to a top-8 bar gauge.
    panels.append(
        per_node_percent(
            19,
            "Root filesystem usage",
            root_usage_expr(),
            {"h": 8, "w": 12, "x": 0, "y": 41},
            legend_calcs=["last"],
            time_from="30d",
            links=link_to("atlas-storage"),
        )
    )
    disk_bands = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 50},
            {"color": "orange", "value": 70},
            {"color": "red", "value": 85},
        ],
    }
    fullest_disks = {
        "id": 20,
        "type": "bargauge",
        "title": "Nodes closest to full root disks",
        "datasource": PROM_DS,
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 41},
        "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A"}],
        "fieldConfig": {
            "defaults": {
                "unit": "percent",
                "min": 0,
                "max": 100,
                "thresholds": disk_bands,
                "displayName": "{{node}}",
            },
            "overrides": [],
        },
        "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
        },
        "links": link_to("atlas-storage"),
    }
    panels.append(fullest_disks)

    # Row at y=49: shared storage usage and free space.
    storage_row = [
        (21, "Astreae usage", astreae_usage_expr("/mnt/astreae"), "percent"),
        (22, "Asteria usage", astreae_usage_expr("/mnt/asteria"), "percent"),
        (23, "Astreae free", astreae_free_expr("/mnt/astreae"), "decbytes"),
        (24, "Asteria free", astreae_free_expr("/mnt/asteria"), "decbytes"),
    ]
    panels.extend(
        stat_panel(
            number,
            heading,
            query,
            {"h": 6, "w": 6, "x": 6 * column, "y": 49},
            unit=unit,
            thresholds=percent_thresholds_for(unit),
            links=link_to("atlas-storage"),
        )
        for column, (number, heading, query, unit) in enumerate(storage_row)
    )

    # Row at y=55: explanatory markdown.
    about_markdown = "\n".join(
        (
            "### Atlas Overview",
            "- Anonymous users land here; follow the panel links for pod/node/storage/network drill-downs.",
            "- Control plane workload count flags any non-system pods that slipped onto the HA nodes.",
            "- Problem and stuck pods use kube-state-metrics so counts and detail tables match exactly.",
        )
    )
    panels.append(text_panel(25, "About this dashboard", about_markdown, {"h": 5, "w": 24, "x": 0, "y": 55}))

    # Dashboard-level navigation to the four drill-down dashboards.
    navigation = [
        {
            "title": f"Atlas {area}",
            "type": "dashboard",
            "dashboardUid": f"atlas-{area.lower()}",
            "keepTime": False,
        }
        for area in ("Pods", "Nodes", "Storage", "Network")
    ]

    return {
        "uid": "atlas-overview",
        "title": "Atlas Overview",
        "folderUid": PUBLIC_FOLDER,
        "editable": False,
        "annotations": {"list": []},
        "panels": panels,
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "overview"],
        "templating": {"list": []},
        "time": {"from": "now-12h", "to": "now"},
        "links": navigation,
    }
|
|
|
|
|
|
def build_pods_dashboard():
    """Assemble the "Atlas Pods" dashboard model.

    Four alarm-style counters (panels 1-4) sit above three full-width tables
    (panels 5-7) whose values are rendered with unit ``s``.

    Returns:
        dict: Editable dashboard model stored in ``PRIVATE_FOLDER``.
    """

    def alarm_thresholds():
        # Green at zero, red once the count reaches 1; fresh dict per panel.
        return {
            "mode": "absolute",
            "steps": [
                {"color": "green", "value": None},
                {"color": "red", "value": 1},
            ],
        }

    counters = [
        ("Problem pods", PROBLEM_PODS_EXPR),
        ("CrashLoop / ImagePull", CRASHLOOP_EXPR),
        ("Stuck terminating (>10m)", STUCK_TERMINATING_EXPR),
        (
            "Control plane workloads",
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
        ),
    ]
    panels = [
        stat_panel(
            slot + 1,
            heading,
            query,
            {"h": 4, "w": 6, "x": 6 * slot, "y": 0},
            thresholds=alarm_thresholds(),
        )
        for slot, (heading, query) in enumerate(counters)
    ]

    # The stuck-terminating table additionally drops rows at or below 600 seconds.
    only_over_ten_minutes = {
        "id": "filterByValue",
        "options": {"match": "Value", "operator": "gt", "value": 600},
    }
    tables = [
        ("Pods not running", PROBLEM_TABLE_EXPR, []),
        ("CrashLoop / ImagePull", CRASHLOOP_TABLE_EXPR, []),
        ("Terminating >10m", STUCK_TABLE_EXPR, [only_over_ten_minutes]),
    ]
    for slot, (heading, query, extra_steps) in enumerate(tables):
        panels.append(
            table_panel(
                5 + slot,
                heading,
                query,
                {"h": 10, "w": 24, "x": 0, "y": 4 + 10 * slot},
                unit="s",
                transformations=[{"id": "labelsToFields", "options": {}}, *extra_steps],
            )
        )

    dashboard = {"uid": "atlas-pods", "title": "Atlas Pods"}
    dashboard["folderUid"] = PRIVATE_FOLDER
    dashboard["editable"] = True
    dashboard["panels"] = panels
    dashboard["time"] = {"from": "now-12h", "to": "now"}
    dashboard["annotations"] = {"list": []}
    dashboard["schemaVersion"] = 39
    dashboard["style"] = "dark"
    dashboard["tags"] = ["atlas", "pods"]
    return dashboard
|
|
|
|
|
|
def build_nodes_dashboard():
    """Assemble the "Atlas Nodes" dashboard model.

    Panels 1-3 are readiness / stray-workload counters; panels 4-8 are
    per-node CPU, RAM, control-plane CPU/RAM and root filesystem series.

    The three counters carry explicit thresholds. Without them ``stat_panel``
    falls back to "green from 1 upward", which would show a single ready node
    as healthy and stray control-plane workloads as green. The thresholds here
    match the ones used for the same metrics on the overview and pods
    dashboards.

    Returns:
        dict: Editable dashboard model stored in ``PRIVATE_FOLDER``.
    """

    def ready_thresholds(expected):
        # Red until every expected node reports Ready.
        return {
            "mode": "absolute",
            "steps": [
                {"color": "red", "value": None},
                {"color": "green", "value": expected},
            ],
        }

    # Any non-system pod on a control-plane node is flagged red.
    stray_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "red", "value": 1},
        ],
    }

    panels = []
    panels.append(
        stat_panel(
            1,
            "Worker nodes ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{WORKER_REGEX}"}})',
            {"h": 4, "w": 8, "x": 0, "y": 0},
            value_suffix=WORKER_SUFFIX,
            thresholds=ready_thresholds(WORKER_TOTAL),
        )
    )
    panels.append(
        stat_panel(
            2,
            "Control plane ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{CONTROL_REGEX}"}})',
            {"h": 4, "w": 8, "x": 8, "y": 0},
            value_suffix=CONTROL_SUFFIX,
            thresholds=ready_thresholds(CONTROL_TOTAL),
        )
    )
    panels.append(
        stat_panel(
            3,
            "Control plane workloads",
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
            {"h": 4, "w": 8, "x": 16, "y": 0},
            thresholds=stray_thresholds,
        )
    )
    panels.append(
        timeseries_panel(
            4,
            "Node CPU",
            node_cpu_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 4},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            5,
            "Node RAM",
            node_mem_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 13},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            6,
            "Control plane (incl. titan-db) CPU",
            node_cpu_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 0, "y": 22},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            7,
            "Control plane (incl. titan-db) RAM",
            node_mem_expr(CONTROL_ALL_REGEX),
            {"h": 9, "w": 12, "x": 12, "y": 22},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            8,
            "Root filesystem usage",
            root_usage_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 31},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    return {
        "uid": "atlas-nodes",
        "title": "Atlas Nodes",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "nodes"],
    }
|
|
|
|
|
|
def build_storage_dashboard():
    """Assemble the "Atlas Storage" dashboard model.

    For each of the two mounts (``/mnt/astreae``, ``/mnt/asteria``) the
    dashboard shows a usage stat, a free-space stat, a 30-day per-node usage
    series scoped to ``LONGHORN_NODE_REGEX`` and a 90-day aggregate usage
    series. Panel ids run 1-8 in that order.

    Returns:
        dict: Editable dashboard model stored in ``PRIVATE_FOLDER``.
    """
    volumes = (("Astreae", "/mnt/astreae"), ("Asteria", "/mnt/asteria"))

    # Panels 1-2: aggregate usage percentage.
    panels = [
        stat_panel(
            1 + column,
            f"{label} usage",
            astreae_usage_expr(mount),
            {"h": 5, "w": 6, "x": 6 * column, "y": 0},
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
        )
        for column, (label, mount) in enumerate(volumes)
    ]
    # Panels 3-4: free bytes (default thresholds).
    panels += [
        stat_panel(
            3 + column,
            f"{label} free",
            astreae_free_expr(mount),
            {"h": 5, "w": 6, "x": 12 + 6 * column, "y": 0},
            unit="decbytes",
        )
        for column, (label, mount) in enumerate(volumes)
    ]
    # Panels 5-6: per-node usage over the last 30 days.
    panels += [
        timeseries_panel(
            5 + column,
            f"{label} per-node usage",
            filesystem_usage_expr(mount, LONGHORN_NODE_REGEX),
            {"h": 9, "w": 12, "x": 12 * column, "y": 5},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
        for column, (label, mount) in enumerate(volumes)
    ]
    # Panels 7-8: aggregate usage over the last 90 days.
    panels += [
        timeseries_panel(
            7 + column,
            f"{label} usage history",
            astreae_usage_expr(mount),
            {"h": 9, "w": 12, "x": 12 * column, "y": 14},
            unit="percent",
            time_from="90d",
        )
        for column, (label, mount) in enumerate(volumes)
    ]

    dashboard = {"uid": "atlas-storage", "title": "Atlas Storage"}
    dashboard["folderUid"] = PRIVATE_FOLDER
    dashboard["editable"] = True
    dashboard["panels"] = panels
    dashboard["time"] = {"from": "now-12h", "to": "now"}
    dashboard["annotations"] = {"list": []}
    dashboard["schemaVersion"] = 39
    dashboard["style"] = "dark"
    dashboard["tags"] = ["atlas", "storage"]
    return dashboard
|
|
|
|
|
|
def build_network_dashboard():
    """Assemble the "Atlas Network" dashboard model (panels 1-8).

    Three stats (ingress, egress, busiest Traefik router), a per-node
    throughput series, two top-10 traffic tables (by namespace and by pod)
    and two Traefik request-rate series.

    Returns:
        dict: Editable dashboard model stored in ``PRIVATE_FOLDER``.
    """

    def top_talkers(selector, grouping):
        # Top ten by transmit + receive byte rate, grouped as requested.
        return (
            f"topk(10, sum(rate(container_network_transmit_bytes_total{{{selector}}}[5m]) "
            f"+ rate(container_network_receive_bytes_total{{{selector}}}[5m])) by ({grouping}))"
        )

    def traffic_table(number, heading, query, grid):
        # Byte-rate table with series labels expanded into columns.
        return table_panel(
            number,
            heading,
            query,
            grid,
            unit="Bps",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )

    def request_rate_series(number, heading, query, grid, label):
        # Request-rate series with a right-hand table legend keyed by `label`.
        return timeseries_panel(
            number,
            heading,
            query,
            grid,
            unit="req/s",
            legend="{{" + label + "}}",
            legend_display="table",
            legend_placement="right",
        )

    panels = [
        stat_panel(1, "Ingress traffic", NET_INGRESS_EXPR, {"h": 4, "w": 8, "x": 0, "y": 0}, unit="Bps"),
        stat_panel(2, "Egress traffic", NET_EGRESS_EXPR, {"h": 4, "w": 8, "x": 8, "y": 0}, unit="Bps"),
        stat_panel(
            3,
            "Top router req/s",
            f"topk(1, {TRAEFIK_ROUTER_EXPR})",
            {"h": 4, "w": 8, "x": 16, "y": 0},
            unit="req/s",
            display_name="{{router}}",
        ),
        timeseries_panel(
            4,
            "Per-node throughput",
            NET_SERIES_EXPR,
            {"h": 8, "w": 24, "x": 0, "y": 4},
            unit="Bps",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
        ),
        traffic_table(
            5,
            "Top namespaces",
            top_talkers('namespace!=""', "namespace"),
            {"h": 9, "w": 12, "x": 0, "y": 12},
        ),
        traffic_table(
            6,
            "Top pods",
            top_talkers('pod!=""', "namespace,pod"),
            {"h": 9, "w": 12, "x": 12, "y": 12},
        ),
        request_rate_series(
            7,
            "Traefik routers (req/s)",
            f"topk(10, {TRAEFIK_ROUTER_EXPR})",
            {"h": 9, "w": 12, "x": 0, "y": 21},
            "router",
        ),
        request_rate_series(
            8,
            "Traefik entrypoints (req/s)",
            "sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))",
            {"h": 9, "w": 12, "x": 12, "y": 21},
            "entrypoint",
        ),
    ]

    dashboard = {"uid": "atlas-network", "title": "Atlas Network"}
    dashboard["folderUid"] = PRIVATE_FOLDER
    dashboard["editable"] = True
    dashboard["panels"] = panels
    dashboard["time"] = {"from": "now-12h", "to": "now"}
    dashboard["annotations"] = {"list": []}
    dashboard["schemaVersion"] = 39
    dashboard["style"] = "dark"
    dashboard["tags"] = ["atlas", "network"]
    return dashboard
|
|
|
|
|
|
# Registry of dashboards keyed by UID. Each entry names the builder that
# produces the dashboard model and the ConfigMap file it is rendered into.
DASHBOARDS = {
    f"atlas-{slug}": {
        "builder": builder,
        "configmap": ROOT / "services" / "monitoring" / f"grafana-dashboard-{slug}.yaml",
    }
    for slug, builder in (
        ("overview", build_overview),
        ("pods", build_pods_dashboard),
        ("nodes", build_nodes_dashboard),
        ("storage", build_storage_dashboard),
        ("network", build_network_dashboard),
    )
}
|
|
|
|
|
|
def write_json(uid, data):
    """Write ``data`` as two-space-indented JSON to ``DASHBOARD_DIR/<uid>.json``.

    The dashboard directory is created first if it does not exist, and the
    file ends with a trailing newline.
    """
    DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(data, indent=2)
    target = DASHBOARD_DIR / f"{uid}.json"
    target.write_text(f"{serialized}\n")
|
|
|
|
|
|
def render_configmap(uid, info):
    """Wrap the stored dashboard JSON for ``uid`` in a ConfigMap manifest.

    Args:
        uid: Dashboard UID; the JSON is read from ``DASHBOARD_DIR/<uid>.json``.
        info: Registry entry whose ``"configmap"`` value is the output path
            (must be located under ``ROOT``).

    Raises:
        FileNotFoundError: If the dashboard JSON has not been generated yet.
    """
    json_path = DASHBOARD_DIR / f"{uid}.json"
    try:
        raw = json_path.read_text(encoding="utf-8")
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"{json_path} does not exist; run with --build to generate it first"
        ) from err

    # Round-trip through the parser so invalid JSON fails here and the
    # embedded payload always uses two-space indentation.
    payload = json.dumps(json.loads(raw), indent=2)
    # Four spaces keep every JSON line inside the `|` literal block scalar
    # even when the data key itself is indented under `data:`; a block
    # scalar body must be indented deeper than the key that introduces it.
    indented = "\n".join("    " + line for line in payload.splitlines())

    output_path = info["configmap"]
    relative_path = output_path.relative_to(ROOT)
    content = CONFIG_TEMPLATE.format(
        relative_path=relative_path,
        name=output_path.stem,
        key=json_path.name,
        payload=indented,
    )
    output_path.write_text(content, encoding="utf-8")
    print(f"Rendered {json_path.name} -> {relative_path}")
|
|
|
|
|
|
def main():
    """Parse the command line, optionally rebuild JSON, then render ConfigMaps.

    With ``--build`` every registered builder is run and its JSON written
    before any ConfigMap is rendered; without it only the rendering pass runs.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--build",
        action="store_true",
        help="Regenerate dashboard JSON files from builders",
    )
    rebuild = cli.parse_args().build

    # Pass 1 (optional): regenerate every JSON file from its builder.
    if rebuild:
        for uid, entry in DASHBOARDS.items():
            model = entry["builder"]()
            write_json(uid, model)

    # Pass 2: wrap each JSON file in its ConfigMap manifest.
    for uid, entry in DASHBOARDS.items():
        render_configmap(uid, entry)
|
|
|
|
|
|
# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|