1789 lines
53 KiB
Python
1789 lines
53 KiB
Python
#!/usr/bin/env python3
|
|
"""Generate Atlas Grafana dashboards and render them into ConfigMaps.
|
|
|
|
Usage:
|
|
scripts/dashboards_render_atlas.py --build # rebuild JSON + ConfigMaps
|
|
scripts/dashboards_render_atlas.py # re-render ConfigMaps from JSON
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import textwrap
|
|
from pathlib import Path
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Paths, folders, and shared metadata
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Directory two levels above this script file (parents[1] of the resolved path).
ROOT = Path(__file__).resolve().parents[1]
# Location of the dashboard files beneath ROOT.
DASHBOARD_DIR = ROOT.joinpath("services", "monitoring", "dashboards")
|
|
CONFIG_TEMPLATE = textwrap.dedent(
|
|
"""# {relative_path}
|
|
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: {name}
|
|
labels:
|
|
grafana_dashboard: "1"
|
|
data:
|
|
{key}: |
|
|
{payload}
|
|
"""
|
|
)
|
|
|
|
# Datasource reference (type + uid) attached to every query panel.
PROM_DS = dict(type="prometheus", uid="atlas-vm")
# Folder UIDs the dashboard builders place their dashboards in.
PUBLIC_FOLDER = "overview"
PRIVATE_FOLDER = "atlas-internal"

# Colour ramp for percentage readings: green, then yellow at 50,
# orange at 75 and red at 91.5.
PERCENT_THRESHOLDS = {
    "mode": "absolute",
    "steps": [
        {"color": color, "value": floor}
        for color, floor in (
            ("green", None),
            ("yellow", 50),
            ("orange", 75),
            ("red", 91.5),
        )
    ],
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Cluster metadata
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Node inventory used to build label matchers for the dashboards.
CONTROL_PLANE_NODES = ["titan-0a", "titan-0b", "titan-0c"]
CONTROL_DEPENDENCIES = ["titan-db"]
CONTROL_ALL = [*CONTROL_PLANE_NODES, *CONTROL_DEPENDENCIES]
# titan-04 through titan-19 inclusive, plus titan-22 and titan-24.
WORKER_NODES = [f"titan-{number:02d}" for number in (*range(4, 20), 22, 24)]

# Regex alternations and counts derived from the lists above.
CONTROL_REGEX = "|".join(CONTROL_PLANE_NODES)
CONTROL_ALL_REGEX = "|".join(CONTROL_ALL)
WORKER_REGEX = "|".join(WORKER_NODES)
CONTROL_TOTAL = len(CONTROL_PLANE_NODES)
WORKER_TOTAL = len(WORKER_NODES)
# "/<total>" suffixes shown after ready-node counts.
CONTROL_SUFFIX = "/" + str(CONTROL_TOTAL)
WORKER_SUFFIX = "/" + str(WORKER_TOTAL)
# Namespaces excluded when counting pods scheduled on control-plane nodes.
CP_ALLOWED_NS = "|".join(
    (
        "kube-system",
        "kube-public",
        "kube-node-lease",
        "longhorn-system",
        "monitoring",
        "flux-system",
    )
)
LONGHORN_NODE_REGEX = "titan-1[2-9]|titan-2[24]"
# Grid widths of the first-row overview panels, left to right.
GAUGE_WIDTHS = [4, 3, 3, 4, 3, 3, 4]
# Pods on control-plane nodes outside the allowed namespaces; falls back to 0
# when the selector matches nothing.
CONTROL_WORKLOADS_EXPR = (
    "sum(kube_pod_info{"
    f'node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"'
    "}) or on() vector(0)"
)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# PromQL helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# node_uname_info with each series' `nodename` copied into a `node` label.
NODE_INFO = 'label_replace(node_uname_info{nodename!=""}, "node", "$1", "nodename", "(.*)")'


def node_filter(regex):
    """Build a ``node``-labelled node_uname_info selector for matching nodes.

    Args:
        regex: Pattern matched against the ``nodename`` label.

    Returns:
        A PromQL string that selects ``node_uname_info`` series whose
        ``nodename`` matches ``regex`` and copies that value into ``node``.
    """
    selector = f'node_uname_info{{nodename=~"{regex}"}}'
    return f'label_replace({selector}, "node", "$1", "nodename", "(.*)")'
|
|
|
|
|
|
def scoped_node_expr(base, scope=""):
    """Average a per-instance expression per node, optionally limited to a scope.

    Args:
        base: PromQL expression whose series carry an ``instance`` label.
        scope: Optional nodename regex; when empty no filtering is applied.

    Returns:
        PromQL that joins ``base`` with NODE_INFO on ``instance`` to obtain a
        ``node`` label, averages by ``node``, and (if ``scope`` is given)
        multiplies by ``node_filter(scope)`` to keep only matching nodes.
    """
    per_node = f"avg by (node) (({base}) * on(instance) group_left(node) {NODE_INFO})"
    if not scope:
        return per_node
    return f"({per_node}) * on(node) group_left() {node_filter(scope)}"
|
|
|
|
|
|
def node_cpu_expr(scope=""):
    """Return per-node CPU busy percentage PromQL (100 minus the 5m idle share).

    Args:
        scope: Optional nodename regex forwarded to ``scoped_node_expr``.
    """
    busy_pct = (
        "(1 - "
        'avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))'
        ") * 100"
    )
    return scoped_node_expr(busy_pct, scope)
|
|
|
|
|
|
def node_mem_expr(scope=""):
    """Return per-node memory usage percentage PromQL.

    Usage is (MemTotal - MemAvailable) / MemTotal * 100, averaged by instance.

    Args:
        scope: Optional nodename regex forwarded to ``scoped_node_expr``.
    """
    used_bytes = "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)"
    used_pct = f"avg by (instance) ({used_bytes} / node_memory_MemTotal_bytes * 100)"
    return scoped_node_expr(used_pct, scope)
|
|
|
|
|
|
def filesystem_usage_expr(mount, scope=""):
    """Return per-node used-space percentage PromQL for one mountpoint.

    Args:
        mount: Value matched exactly against the ``mountpoint`` label.
        scope: Optional nodename regex forwarded to ``scoped_node_expr``.

    Returns:
        PromQL computing (1 - avail / size) * 100 per instance, excluding
        tmpfs and overlay filesystems, wrapped by ``scoped_node_expr``.
    """
    selector = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
    used_pct = (
        "avg by (instance) ("
        f"(1 - (node_filesystem_avail_bytes{selector} "
        f"/ node_filesystem_size_bytes{selector})) * 100)"
    )
    return scoped_node_expr(used_pct, scope)
|
|
|
|
|
|
def root_usage_expr(scope=""):
    """Return per-node usage percentage PromQL for the ``/`` mountpoint."""
    return filesystem_usage_expr(mount="/", scope=scope)
|
|
|
|
|
|
def astreae_usage_expr(mount):
    """Return PromQL for the used-space percentage summed over one mountpoint.

    Args:
        mount: Value matched exactly against the ``mountpoint`` label.

    Returns:
        PromQL computing 100 - (sum(avail) / sum(size) * 100) across every
        matching series, excluding tmpfs and overlay filesystems.
    """
    selector = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
    available = f"sum(node_filesystem_avail_bytes{selector})"
    capacity = f"sum(node_filesystem_size_bytes{selector})"
    return f"100 - ({available} / {capacity} * 100)"
|
|
|
|
|
|
def astreae_free_expr(mount):
    """Return PromQL summing available bytes for one mountpoint.

    Args:
        mount: Value matched exactly against the ``mountpoint`` label.
    """
    selector = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
    return "sum(node_filesystem_avail_bytes" + selector + ")"
|
|
|
|
|
|
def topk_with_node(expr):
    """Wrap ``expr`` in ``topk(1, ...)`` and copy ``node`` into ``__name__``.

    Args:
        expr: PromQL expression whose series carry a ``node`` label.
    """
    top_series = f"topk(1, {expr})"
    return 'label_replace(' + top_series + ', "__name__", "$1", "node", "(.*)")'
|
|
|
|
|
|
def node_net_expr(scope=""):
    """Return per-node network throughput PromQL (receive + transmit, 5m rate).

    Only the ``lo`` device is excluded.

    Args:
        scope: Optional nodename regex forwarded to ``scoped_node_expr``.
    """
    received, transmitted = (
        f'rate(node_network_{direction}_bytes_total{{device!~"lo"}}[5m])'
        for direction in ("receive", "transmit")
    )
    throughput = f"sum by (instance) ({received} + {transmitted})"
    return scoped_node_expr(throughput, scope)
|
|
|
|
|
|
def node_io_expr(scope=""):
    """Return per-node disk throughput PromQL (read + written bytes, 5m rate).

    Args:
        scope: Optional nodename regex forwarded to ``scoped_node_expr``.
    """
    reads = "rate(node_disk_read_bytes_total[5m])"
    writes = "rate(node_disk_written_bytes_total[5m])"
    return scoped_node_expr(f"sum by (instance) ({reads} + {writes})", scope)
|
|
|
|
|
|
def namespace_share_expr(resource_expr):
    """Return PromQL for each top namespace's percentage share of a resource.

    Args:
        resource_expr: Per-namespace PromQL expression to apportion.

    Returns:
        PromQL that keeps only namespaces present in NAMESPACE_TOP_FILTER and
        divides each by their sum (clamped to at least 1), times 100.
    """
    top_only = f"( {resource_expr} ) and on(namespace) ( {NAMESPACE_TOP_FILTER} )"
    # str.format inserts the argument verbatim, so PromQL braces in it are safe.
    return "100 * ( {0} ) / clamp_min(sum( {0} ), 1)".format(top_only)
|
|
|
|
|
|
def namespace_cpu_share_expr():
    """Return the per-namespace share expression built from NAMESPACE_CPU_RAW."""
    return namespace_share_expr(resource_expr=NAMESPACE_CPU_RAW)
|
|
|
|
|
|
def namespace_ram_share_expr():
    """Return the per-namespace share expression built from NAMESPACE_RAM_RAW."""
    return namespace_share_expr(resource_expr=NAMESPACE_RAM_RAW)
|
|
|
|
|
|
def namespace_gpu_share_expr():
    """Return the per-namespace share expression built from NAMESPACE_GPU_RAW."""
    return namespace_share_expr(resource_expr=NAMESPACE_GPU_RAW)
|
|
|
|
|
|
# Count of pods reporting a phase other than Running/Succeeded (0 when none).
PROBLEM_PODS_EXPR = (
    'sum(max by (namespace,pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}))'
    " or on() vector(0)"
)
# Count of pods with a container waiting on CrashLoopBackOff/ImagePullBackOff.
CRASHLOOP_EXPR = (
    "sum(max by (namespace,pod) ("
    'kube_pod_container_status_waiting_reason{reason=~"CrashLoopBackOff|ImagePullBackOff"}'
    ")) or on() vector(0)"
)
# Count of pods whose deletion timestamp is more than 600 seconds old.
STUCK_TERMINATING_EXPR = (
    "sum(max by (namespace,pod) ("
    '((time() - kube_pod_deletion_timestamp{pod!=""}) > bool 600) '
    'and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0)'
    ")) or on() vector(0)"
)
|
|
# Look-back window for the availability averages below.
UPTIME_WINDOW = "30d"
# Available / desired replicas of the "traefik" deployment (denominator >= 1).
TRAEFIK_READY_EXPR = (
    '(sum(kube_deployment_status_replicas_available{namespace=~"traefik|kube-system",deployment="traefik"})'
    " / "
    'clamp_min(sum(kube_deployment_spec_replicas{namespace=~"traefik|kube-system",deployment="traefik"}), 1))'
)
# Ready control-plane nodes divided by the configured control-plane count.
CONTROL_READY_FRACTION_EXPR = (
    '(sum(kube_node_status_condition{condition="Ready",status="true",node=~"'
    + CONTROL_REGEX
    + '"}) / '
    + str(CONTROL_TOTAL)
    + ")"
)
# Availability is the smaller of the two fractions above.
UPTIME_AVAIL_EXPR = "min((" + CONTROL_READY_FRACTION_EXPR + "), (" + TRAEFIK_READY_EXPR + "))"

# Tie-breaker to deterministically pick one node per namespace when shares tie.
NODE_TIEBREAKER = " + ".join(
    "({}) * 1e-6 * {}".format(node_filter(name), rank)
    for rank, name in enumerate([*CONTROL_ALL, *WORKER_NODES], start=1)
)
# Availability averaged over UPTIME_WINDOW, sampled through a 5m subquery.
UPTIME_AVG_EXPR = "avg_over_time((" + UPTIME_AVAIL_EXPR + ")[" + UPTIME_WINDOW + ":5m])"
UPTIME_PERCENT_EXPR = UPTIME_AVG_EXPR
# "Number of nines": -log10 of the unavailability, with availability capped
# just below 1 so the logarithm stays finite.
UPTIME_NINES_EXPR = "-log10(1 - clamp_max(" + UPTIME_AVG_EXPR + ", 0.999999999))"
# Colour ramp for the nines value.
UPTIME_THRESHOLDS = {
    "mode": "absolute",
    "steps": [
        {"color": color, "value": floor}
        for color, floor in (("red", None), ("orange", 2), ("yellow", 3), ("green", 3.5))
    ],
}
# Colour ramp for the availability fraction (0..1).
UPTIME_PERCENT_THRESHOLDS = {
    "mode": "absolute",
    "steps": [
        {"color": color, "value": floor}
        for color, floor in (
            ("red", None),
            ("orange", 0.999),
            ("yellow", 0.9999),
            ("green", 0.99999),
        )
    ],
}
|
|
# Pod age in seconds, joined with node and phase, for pods not Running/Succeeded.
PROBLEM_TABLE_EXPR = " ".join(
    [
        '(time() - kube_pod_created{pod!=""})',
        "* on(namespace,pod) group_left(node) kube_pod_info",
        "* on(namespace,pod) group_left(phase)",
        'max by (namespace,pod,phase) (kube_pod_status_phase{phase!~"Running|Succeeded"})',
    ]
)
# Pod age in seconds, joined with node and waiting reason, for containers in
# CrashLoopBackOff/ImagePullBackOff.
CRASHLOOP_TABLE_EXPR = " ".join(
    [
        '(time() - kube_pod_created{pod!=""})',
        "* on(namespace,pod) group_left(node) kube_pod_info",
        "* on(namespace,pod,container) group_left(reason)",
        "max by (namespace,pod,container,reason)",
        '(kube_pod_container_status_waiting_reason{reason=~"CrashLoopBackOff|ImagePullBackOff"})',
    ]
)
# Seconds since each pod's deletion timestamp, joined with its node.
STUCK_TABLE_EXPR = (
    "("
    + " ".join(
        [
            '((time() - kube_pod_deletion_timestamp{pod!=""})',
            'and on(namespace,pod) (kube_pod_deletion_timestamp{pod!=""} > bool 0))',
            "* on(namespace,pod) group_left(node) kube_pod_info",
        ]
    )
    + ")"
)
|
|
|
|
# Per-namespace CPU usage rate (5m) and working-set memory.
NAMESPACE_CPU_RAW = (
    "sum(rate(container_cpu_usage_seconds_total"
    '{namespace!="",pod!="",container!=""}[5m])) by (namespace)'
)
NAMESPACE_RAM_RAW = (
    "sum(container_memory_working_set_bytes"
    '{namespace!="",pod!="",container!=""}) by (namespace)'
)
GPU_NODES = ["titan-20", "titan-21", "titan-22", "titan-24"]
GPU_NODE_REGEX = "|".join(GPU_NODES)
# Per-namespace nvidia.com/gpu requests, or limits where no request series exists.
NAMESPACE_GPU_ALLOC = (
    'sum((kube_pod_container_resource_requests{namespace!="",resource="nvidia.com/gpu"} or '
    'kube_pod_container_resource_limits{namespace!="",resource="nvidia.com/gpu"})) by (namespace)'
)
# Per-namespace sum of each series' maximum GPU utilisation over the dashboard range.
NAMESPACE_GPU_USAGE_SHARE = (
    "sum by (namespace) "
    '(max_over_time(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}[$__range]))'
)
NAMESPACE_GPU_USAGE_INSTANT = 'sum(DCGM_FI_DEV_GPU_UTIL{namespace!="",pod!=""}) by (namespace)'
# The `or ... * 0` arm supplies a zero for namespaces that have CPU series
# but no GPU series.
NAMESPACE_GPU_RAW = f"({NAMESPACE_GPU_USAGE_SHARE}) or on(namespace) ({NAMESPACE_CPU_RAW} * 0)"
NAMESPACE_GPU_WEIGHT = f"({NAMESPACE_GPU_ALLOC}) or on(namespace) ({NAMESPACE_CPU_RAW} * 0)"
# Ranking score: CPU + RAM / 1e9 + GPU weight * 100.
NAMESPACE_ACTIVITY_SCORE = (
    f"( {NAMESPACE_CPU_RAW} ) + ({NAMESPACE_RAM_RAW} / 1e9) + ({NAMESPACE_GPU_WEIGHT} * 100)"
)
# The ten highest-scoring namespaces, used as an `and on(namespace)` filter.
NAMESPACE_TOP_FILTER = f"(topk(10, {NAMESPACE_ACTIVITY_SCORE}) >= bool 0)"
# Per-router Traefik request rate.
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
|
|
# Per-router Traefik request rate.
TRAEFIK_ROUTER_EXPR = "sum by (router) (rate(traefik_router_requests_total[5m]))"
# Bytes/s received and sent by traefik-* pods in the traefik namespace.
TRAEFIK_NET_INGRESS = (
    "sum(rate(container_network_receive_bytes_total"
    '{namespace="traefik",pod=~"traefik-.*"}[5m])) or on() vector(0)'
)
TRAEFIK_NET_EGRESS = (
    "sum(rate(container_network_transmit_bytes_total"
    '{namespace="traefik",pod=~"traefik-.*"}[5m])) or on() vector(0)'
)
# Cluster-wide container receive/transmit rates.
NET_CLUSTER_RX = (
    "sum(rate(container_network_receive_bytes_total"
    '{namespace!="",pod!="",container!=""}[5m])) or on() vector(0)'
)
NET_CLUSTER_TX = (
    "sum(rate(container_network_transmit_bytes_total"
    '{namespace!="",pod!="",container!=""}[5m])) or on() vector(0)'
)
# Device matcher that drops loopback and the listed virtual/overlay interfaces.
PHYSICAL_NET_FILTER = 'device!~"lo|cni.*|veth.*|flannel.*|docker.*|virbr.*|vxlan.*|wg.*"'
NET_NODE_RX_PHYS = (
    "sum(rate(node_network_receive_bytes_total{"
    + PHYSICAL_NET_FILTER
    + "}[5m])) or on() vector(0)"
)
NET_NODE_TX_PHYS = (
    "sum(rate(node_network_transmit_bytes_total{"
    + PHYSICAL_NET_FILTER
    + "}[5m])) or on() vector(0)"
)
# NOTE(review): "total" currently aliases the transmit-only expression;
# confirm that is intended rather than rx + tx.
NET_TOTAL_EXPR = NET_NODE_TX_PHYS
NET_INGRESS_EXPR = NET_NODE_RX_PHYS
NET_EGRESS_EXPR = NET_NODE_TX_PHYS
# Receive + transmit rate of all pods outside the traefik namespace.
NET_INTERNAL_EXPR = (
    "sum("
    'rate(container_network_receive_bytes_total{namespace!="traefik",pod!=""}[5m]) + '
    'rate(container_network_transmit_bytes_total{namespace!="traefik",pod!=""}[5m])'
    ") or on() vector(0)"
)
|
|
# Rate of API server requests answered with a 5xx code.
APISERVER_5XX_RATE = 'sum(rate(apiserver_request_total{code=~"5.."}[5m]))'
# 99th-percentile request durations, converted from seconds to milliseconds.
APISERVER_P99_LATENCY_MS = (
    "histogram_quantile(0.99, "
    "sum by (le) (rate(apiserver_request_duration_seconds_bucket[5m]))"
    ") * 1000"
)
ETCD_P99_LATENCY_MS = (
    "histogram_quantile(0.99, "
    "sum by (le) (rate(etcd_request_duration_seconds_bucket[5m]))"
    ") * 1000"
)
# Traefik entrypoint request rates and the resulting non-5xx ratio
# (denominator clamped to at least 1).
TRAEFIK_TOTAL_5M = "sum(rate(traefik_entrypoint_requests_total[5m]))"
TRAEFIK_SUCCESS_5M = 'sum(rate(traefik_entrypoint_requests_total{code!~"5.."}[5m]))'
TRAEFIK_SLI_5M = "(" + TRAEFIK_SUCCESS_5M + ") / clamp_min(" + TRAEFIK_TOTAL_5M + ", 1)"
TRAEFIK_P99_LATENCY_MS = (
    "histogram_quantile(0.99, "
    "sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))"
    ") * 1000"
)
TRAEFIK_P95_LATENCY_MS = (
    "histogram_quantile(0.95, "
    "sum by (le) (rate(traefik_entrypoint_request_duration_seconds_bucket[5m]))"
    ") * 1000"
)
|
|
# Availability target used to derive the error budget in traefik_burn().
SLO_AVAILABILITY = 0.999


def traefik_sli(window):
    """Return PromQL for the fraction of non-5xx Traefik entrypoint requests.

    Args:
        window: Range-vector duration inserted into ``rate(...[window])``,
            e.g. ``"5m"``.

    Returns:
        ``(success rate) / clamp_min(total rate, 1)`` as a PromQL string.
    """
    total = f"sum(rate(traefik_entrypoint_requests_total[{window}]))"
    success = f'sum(rate(traefik_entrypoint_requests_total{{code!~"5.."}}[{window}]))'
    return f"({success}) / clamp_min({total}, 1)"


def traefik_burn(window):
    """Return PromQL for the error-budget burn rate over ``window``.

    The burn rate is the observed error ratio ``1 - SLI`` divided by the
    allowed error ratio ``1 - SLO_AVAILABILITY``.

    Args:
        window: Range-vector duration forwarded to ``traefik_sli``.
    """
    # 1 - 0.999 evaluates to 0.0010000000000000009 in binary floating point;
    # format with 10 significant digits so the query shows the budget (0.001)
    # instead of the representation error.
    error_budget = f"{1 - SLO_AVAILABILITY:.10g}"
    return f"(1 - ({traefik_sli(window)})) / {error_budget}"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Panel factories
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def stat_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    decimals=None,
    thresholds=None,
    text_mode="value",
    legend=None,
    instant=False,
    value_suffix=None,
    links=None,
):
    """Build a Grafana "stat" panel dict with a single query.

    Args:
        panel_id: Value stored under the panel's ``id`` key.
        title: Panel title.
        expr: Query expression for target ``A``.
        grid: ``gridPos`` dict for the panel.
        unit: Field unit.
        decimals: Decimal places; omitted from the panel when ``None``.
        thresholds: Threshold config; a grey/green-at-1 default is used when
            falsy.
        text_mode: Value for ``options.textMode``.
        legend: Optional ``legendFormat`` for the target.
        instant: When true, marks the target as an instant query.
        value_suffix: Optional ``custom.valueSuffix`` string.
        links: Optional list of panel links.

    Returns:
        The panel definition as a dict.
    """
    if not thresholds:
        thresholds = {
            "mode": "absolute",
            "steps": [
                {"color": "rgba(115, 115, 115, 1)", "value": None},
                {"color": "green", "value": 1},
            ],
        }

    custom = {"displayMode": "auto"}
    if value_suffix:
        custom["valueSuffix"] = value_suffix

    field_defaults = {
        "color": {"mode": "thresholds"},
        "mappings": [],
        "thresholds": thresholds,
        "unit": unit,
        "custom": custom,
    }
    if decimals is not None:
        field_defaults["decimals"] = decimals

    query = {"expr": expr, "refId": "A"}
    if legend:
        query["legendFormat"] = legend
    if instant:
        query["instant"] = True

    panel = {
        "id": panel_id,
        "type": "stat",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {"defaults": field_defaults, "overrides": []},
        "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "textMode": text_mode,
        },
    }
    if links:
        panel["links"] = links
    return panel
|
|
|
|
|
|
def gauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    min_value=0,
    max_value=1,
    thresholds=None,
    links=None,
):
    """Build a Grafana "gauge" panel dict with a single query.

    Args:
        panel_id: Value stored under the panel's ``id`` key.
        title: Panel title.
        expr: Query expression for target ``A``.
        grid: ``gridPos`` dict for the panel.
        min_value: Lower bound of the gauge.
        max_value: Upper bound of the gauge.
        thresholds: Threshold config; when falsy, green with red starting at
            ``max_value`` is used.
        links: Optional list of panel links; the key is omitted when falsy.

    Returns:
        The panel definition as a dict.
    """
    if not thresholds:
        thresholds = {
            "mode": "absolute",
            "steps": [
                {"color": "green", "value": None},
                {"color": "red", "value": max_value},
            ],
        }

    panel = {
        "id": panel_id,
        "type": "gauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {
            "defaults": {"min": min_value, "max": max_value, "thresholds": thresholds},
            "overrides": [],
        },
        "options": {
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "orientation": "auto",
            "showThresholdMarkers": False,
            "showThresholdLabels": False,
        },
    }
    if links:
        panel["links"] = links
    return panel
|
|
|
|
|
|
def timeseries_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    legend=None,
    legend_display="table",
    legend_placement="bottom",
    legend_calcs=None,
    time_from=None,
    links=None,
):
    """Build a Grafana "timeseries" panel dict with a single query.

    Args:
        panel_id: Value stored under the panel's ``id`` key.
        title: Panel title.
        expr: Query expression for target ``A``.
        grid: ``gridPos`` dict for the panel.
        unit: Field unit.
        legend: Optional ``legendFormat`` for the target.
        legend_display: Legend ``displayMode``.
        legend_placement: Legend ``placement``.
        legend_calcs: Optional list of legend calculations.
        time_from: Optional panel-level ``timeFrom`` override.
        links: Optional list of panel links.

    Returns:
        The panel definition as a dict; optional keys are present only when
        their argument is truthy.
    """
    legend_options = {"displayMode": legend_display, "placement": legend_placement}
    if legend_calcs:
        legend_options["calcs"] = legend_calcs

    query = {"expr": expr, "refId": "A"}
    if legend:
        query["legendFormat"] = legend

    panel = {
        "id": panel_id,
        "type": "timeseries",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
        "options": {"legend": legend_options, "tooltip": {"mode": "multi"}},
    }
    for key, value in (("timeFrom", time_from), ("links", links)):
        if value:
            panel[key] = value
    return panel
|
|
|
|
|
|
def table_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    transformations=None,
    instant=False,
    options=None,
):
    """Build a Grafana "table" panel dict with a single query.

    Args:
        panel_id: Value stored under the panel's ``id`` key.
        title: Panel title.
        expr: Query expression for target ``A``.
        grid: ``gridPos`` dict for the panel.
        unit: Field unit.
        transformations: Optional list of transformation dicts.
        instant: When true, marks the target as an instant query.
        options: Optional overrides merged over the default panel options
            (``showHeader`` on, ``columnFilters`` off).

    Returns:
        The panel definition as a dict.
    """
    merged_options = {"showHeader": True, "columnFilters": False}
    if options:
        merged_options.update(options)

    query = {"expr": expr, "refId": "A"}
    if instant:
        query["instant"] = True

    panel = {
        "id": panel_id,
        "type": "table",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
        "options": merged_options,
    }
    if transformations:
        panel["transformations"] = transformations
    return panel
|
|
|
|
|
|
def pie_panel(panel_id, title, expr, grid):
    """Build a Grafana "piechart" panel dict whose slices are named by namespace.

    The target's ``legendFormat`` is fixed to ``{{namespace}}`` and the field
    unit to ``percent``.

    Args:
        panel_id: Value stored under the panel's ``id`` key.
        title: Panel title.
        expr: Query expression for target ``A``.
        grid: ``gridPos`` dict for the panel.

    Returns:
        The panel definition as a dict.
    """
    query = {"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}
    field_config = {
        "defaults": {"unit": "percent", "color": {"mode": "palette-classic"}},
        "overrides": [],
    }
    options = {
        "legend": {"displayMode": "list", "placement": "right"},
        "pieType": "pie",
        "displayLabels": [],
        "tooltip": {"mode": "single"},
        "colorScheme": "interpolateSpectral",
        "colorBy": "value",
        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
    }
    return {
        "id": panel_id,
        "type": "piechart",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": field_config,
        "options": options,
    }
|
|
|
|
|
|
def bargauge_panel(
    panel_id,
    title,
    expr,
    grid,
    *,
    unit="none",
    links=None,
    limit=None,
    thresholds=None,
    decimals=None,
    instant=False,
):
    """Build a Grafana "bargauge" panel dict whose bars are named by node.

    Args:
        panel_id: Value stored under the panel's ``id`` key.
        title: Panel title.
        expr: Query expression for target ``A``.
        grid: ``gridPos`` dict for the panel.
        unit: Field unit; ``"percent"`` also sets the field max to 100.
        links: Optional list of panel links.
        limit: Optional row limit appended as a ``limit`` transformation.
        thresholds: Threshold config; a 50/70/85 ramp is used when falsy.
        decimals: Decimal places; omitted from the panel when ``None``.
        instant: When true, marks the target as an instant query.

    Returns:
        The panel definition as a dict.
    """
    if not thresholds:
        thresholds = {
            "mode": "absolute",
            "steps": [
                {"color": "green", "value": None},
                {"color": "yellow", "value": 50},
                {"color": "orange", "value": 70},
                {"color": "red", "value": 85},
            ],
        }

    query = {"expr": expr, "refId": "A", "legendFormat": "{{node}}"}
    if instant:
        query["instant"] = True

    field_defaults = {
        "unit": unit,
        "min": 0,
        "max": 100 if unit == "percent" else None,
        "thresholds": thresholds,
    }
    if decimals is not None:
        field_defaults["decimals"] = decimals

    # Keep bars ordered by value descending for readability.
    # NOTE(review): confirm these option shapes against the Grafana
    # transformation schema (sortBy is usually documented with a `sort` list,
    # limit with `limitField`) — TODO verify they take effect.
    transformations = [
        {"id": "sortBy", "options": {"fields": ["Value"], "order": "desc"}},
    ]
    if limit:
        transformations.append({"id": "limit", "options": {"limit": limit}})

    panel = {
        "id": panel_id,
        "type": "bargauge",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {"defaults": field_defaults, "overrides": []},
        "options": {
            "displayMode": "gradient",
            "orientation": "horizontal",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
        },
    }
    if links:
        panel["links"] = links
    panel["transformations"] = transformations
    return panel
|
|
|
|
|
|
def text_panel(panel_id, title, content, grid):
    """Build a Grafana "text" panel dict that renders ``content`` as markdown.

    Args:
        panel_id: Value stored under the panel's ``id`` key.
        title: Panel title.
        content: Markdown body.
        grid: ``gridPos`` dict for the panel.
    """
    panel = dict(id=panel_id, type="text", title=title, gridPos=grid, datasource=None)
    panel["options"] = {"mode": "markdown", "content": content}
    return panel
|
|
|
|
|
|
def link_to(uid):
    """Return a one-element panel link list pointing at dashboard ``uid``.

    The link targets ``/d/<uid>`` and opens in a new tab.
    """
    link = {
        "title": f"Open {uid} dashboard",
        "url": "/d/" + f"{uid}",
        "targetBlank": True,
    }
    return [link]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Dashboard builders
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def build_overview():
    """Assemble the "Atlas Overview" dashboard definition.

    Panels are appended in a fixed order: the first-row health gauges/stats,
    the "hottest node" stats, storage stats, namespace share pies, worker and
    control-plane CPU/RAM graphs, pod distribution, network throughput, and
    root filesystem usage.

    Returns:
        A dict with uid ``atlas-overview`` placed in PUBLIC_FOLDER and marked
        non-editable.
    """
    panels = []

    # Shared green/yellow/orange/red ramp (0/1/2/3+) for the problem counters.
    count_thresholds = {
        "mode": "absolute",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 1},
            {"color": "orange", "value": 2},
            {"color": "red", "value": 3},
        ],
    }
    ready_nodes = 'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{}"}})'

    row1_stats = [
        {
            "id": 2,
            "title": "Control Plane Ready",
            "expr": ready_nodes.format(CONTROL_REGEX),
            "kind": "gauge",
            "max_value": CONTROL_TOTAL,
            "thresholds": {
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "green", "value": CONTROL_TOTAL},
                ],
            },
        },
        {
            "id": 3,
            "title": "Control Plane Workloads",
            "expr": CONTROL_WORKLOADS_EXPR,
            "kind": "stat",
            "thresholds": count_thresholds,
            "links": link_to("atlas-pods"),
        },
        {
            "id": 5,
            "title": "Stuck Terminating",
            "expr": STUCK_TERMINATING_EXPR,
            "kind": "stat",
            "thresholds": count_thresholds,
            "links": link_to("atlas-pods"),
        },
        {
            "id": 27,
            "title": "Atlas Availability (30d)",
            "expr": UPTIME_PERCENT_EXPR,
            "kind": "stat",
            "thresholds": UPTIME_PERCENT_THRESHOLDS,
            "unit": "percentunit",
            "decimals": 3,
            "text_mode": "value",
        },
        {
            "id": 4,
            "title": "Problem Pods",
            "expr": PROBLEM_PODS_EXPR,
            "kind": "stat",
            "thresholds": count_thresholds,
            "links": link_to("atlas-pods"),
        },
        {
            "id": 6,
            "title": "CrashLoop / ImagePull",
            "expr": CRASHLOOP_EXPR,
            "kind": "stat",
            "thresholds": count_thresholds,
            "links": link_to("atlas-pods"),
        },
        {
            "id": 1,
            "title": "Workers Ready",
            "expr": ready_nodes.format(WORKER_REGEX),
            "kind": "gauge",
            "max_value": WORKER_TOTAL,
            "thresholds": {
                "mode": "absolute",
                "steps": [
                    {"color": "red", "value": None},
                    {"color": "orange", "value": WORKER_TOTAL - 2},
                    {"color": "yellow", "value": WORKER_TOTAL - 1},
                    {"color": "green", "value": WORKER_TOTAL},
                ],
            },
        },
    ]

    for idx, item in enumerate(row1_stats):
        # Width comes from GAUGE_WIDTHS (4 if the row outgrows it); x is the
        # running sum of the widths to the left.
        width = GAUGE_WIDTHS[idx] if idx < len(GAUGE_WIDTHS) else 4
        grid = {"h": 5, "w": width, "x": sum(GAUGE_WIDTHS[:idx]), "y": 0}
        if item.get("kind", "gauge") == "stat":
            panel = stat_panel(
                item["id"],
                item["title"],
                item["expr"],
                grid,
                thresholds=item.get("thresholds"),
                legend=None,
                links=item.get("links"),
                text_mode=item.get("text_mode", "value"),
                value_suffix=item.get("value_suffix"),
                unit=item.get("unit", "none"),
                decimals=item.get("decimals"),
            )
        else:
            panel = gauge_panel(
                item["id"],
                item["title"],
                item["expr"],
                grid,
                min_value=0,
                max_value=item.get("max_value", 5),
                thresholds=item.get("thresholds"),
                links=item.get("links"),
            )
        panels.append(panel)

    hottest = [
        (7, "Hottest node: CPU", topk_with_node(node_cpu_expr()), "percent"),
        (8, "Hottest node: RAM", topk_with_node(node_mem_expr()), "percent"),
        (9, "Hottest node: NET (rx+tx)", topk_with_node(node_net_expr()), "Bps"),
        (10, "Hottest node: I/O (r+w)", topk_with_node(node_io_expr()), "Bps"),
    ]
    for idx, (panel_id, title, expr, unit) in enumerate(hottest):
        panels.append(
            stat_panel(
                panel_id,
                title,
                expr,
                {"h": 3, "w": 6, "x": 6 * idx, "y": 5},
                unit=unit,
                thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                text_mode="name_and_value",
                legend="{{node}}",
                instant=True,
                links=link_to("atlas-nodes"),
            )
        )

    storage_panels = [
        (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
        (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"),
        (25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"),
        (26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"),
    ]
    for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
        panels.append(
            stat_panel(
                panel_id,
                title,
                expr,
                {"h": 6, "w": 6, "x": 6 * idx, "y": 10},
                unit=unit,
                thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
                links=link_to("atlas-storage"),
            )
        )

    namespace_pies = [
        (11, "Namespace CPU Share", namespace_cpu_share_expr()),
        (12, "Namespace GPU Share", namespace_gpu_share_expr()),
        (13, "Namespace RAM Share", namespace_ram_share_expr()),
    ]
    for idx, (panel_id, title, expr) in enumerate(namespace_pies):
        panels.append(pie_panel(panel_id, title, expr, {"h": 9, "w": 8, "x": 8 * idx, "y": 16}))

    worker_graphs = [
        (14, "Worker Node CPU", node_cpu_expr(WORKER_REGEX)),
        (15, "Worker Node RAM", node_mem_expr(WORKER_REGEX)),
    ]
    for idx, (panel_id, title, expr) in enumerate(worker_graphs):
        panels.append(
            timeseries_panel(
                panel_id,
                title,
                expr,
                {"h": 12, "w": 12, "x": 12 * idx, "y": 32},
                unit="percent",
                legend="{{node}}",
                legend_calcs=["last"],
                legend_display="table",
                legend_placement="right",
                links=link_to("atlas-nodes"),
            )
        )

    control_graphs = [
        (16, "Control plane CPU", node_cpu_expr(CONTROL_ALL_REGEX)),
        (17, "Control plane RAM", node_mem_expr(CONTROL_ALL_REGEX)),
    ]
    for idx, (panel_id, title, expr) in enumerate(control_graphs):
        panels.append(
            timeseries_panel(
                panel_id,
                title,
                expr,
                {"h": 10, "w": 12, "x": 12 * idx, "y": 44},
                unit="percent",
                legend="{{node}}",
                legend_display="table",
                legend_placement="right",
            )
        )

    # NOTE(review): pie_panel fixes legendFormat to {{namespace}} while this
    # query is grouped by node — confirm the slice labels render as intended.
    panels.append(
        pie_panel(
            28,
            "Node Pod Share",
            '(sum(kube_pod_info{pod!="" , node!=""}) by (node) / clamp_min(sum(kube_pod_info{pod!="" , node!=""}), 1)) * 100',
            {"h": 10, "w": 12, "x": 0, "y": 54},
        )
    )
    panels.append(
        bargauge_panel(
            29,
            "Top Nodes by Pod Count",
            'topk(12, sum(kube_pod_info{pod!="" , node!=""}) by (node))',
            {"h": 10, "w": 12, "x": 12, "y": 54},
            unit="none",
            limit=12,
            decimals=0,
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 50},
                    {"color": "orange", "value": 75},
                    {"color": "red", "value": 100},
                ],
            },
            instant=True,
        )
    )

    network_graphs = [
        (18, "Cluster Ingress Throughput", NET_INGRESS_EXPR, "Ingress (Traefik)"),
        (19, "Cluster Egress Throughput", NET_EGRESS_EXPR, "Egress (Traefik)"),
        (20, "Intra-Cluster Throughput", NET_INTERNAL_EXPR, "Internal traffic"),
    ]
    for idx, (panel_id, title, expr, legend) in enumerate(network_graphs):
        panels.append(
            timeseries_panel(
                panel_id,
                title,
                expr,
                {"h": 7, "w": 8, "x": 8 * idx, "y": 25},
                unit="Bps",
                legend=legend,
                legend_display="list",
                legend_placement="bottom",
                links=link_to("atlas-network"),
            )
        )

    panels.append(
        timeseries_panel(
            21,
            "Root Filesystem Usage",
            root_usage_expr(),
            {"h": 16, "w": 12, "x": 0, "y": 64},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            time_from="30d",
            links=link_to("atlas-storage"),
        )
    )
    panels.append(
        bargauge_panel(
            22,
            "Nodes Closest to Full Root Disks",
            f"topk(12, {root_usage_expr()})",
            {"h": 16, "w": 12, "x": 12, "y": 64},
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
            links=link_to("atlas-storage"),
        )
    )

    return {
        "uid": "atlas-overview",
        "title": "Atlas Overview",
        "folderUid": PUBLIC_FOLDER,
        "editable": False,
        "annotations": {"list": []},
        "panels": panels,
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "overview"],
        "templating": {"list": []},
        "time": {"from": "now-1h", "to": "now"},
        "refresh": "1m",
        "links": [],
    }
|
|
|
|
|
|
def build_pods_dashboard():
    """Assemble the "Atlas Pods" dashboard definition.

    The layout is a row of four problem counters, three detail tables, a
    pod-distribution pie and bar gauge, and a table showing which node holds
    the largest share of each namespace's pods.

    Returns:
        A dict with uid ``atlas-pods`` placed in PRIVATE_FOLDER and marked
        editable.
    """
    panels = []

    # Every summary stat uses an expression with an `or on() vector(0)`
    # fallback, so an empty result shows 0 instead of "No data". The
    # control-plane stat reuses CONTROL_WORKLOADS_EXPR for the same reason.
    summary_stats = [
        (1, "Problem Pods", PROBLEM_PODS_EXPR),
        (2, "CrashLoop / ImagePull", CRASHLOOP_EXPR),
        (3, "Stuck Terminating (>10m)", STUCK_TERMINATING_EXPR),
        (4, "Control Plane Workloads", CONTROL_WORKLOADS_EXPR),
    ]
    for idx, (panel_id, title, expr) in enumerate(summary_stats):
        panels.append(
            stat_panel(
                panel_id,
                title,
                expr,
                {"h": 4, "w": 6, "x": 6 * idx, "y": 0},
                # Any non-zero count turns the stat red.
                thresholds={
                    "mode": "absolute",
                    "steps": [
                        {"color": "green", "value": None},
                        {"color": "red", "value": 1},
                    ],
                },
            )
        )

    panels.append(
        table_panel(
            5,
            "Pods Not Running",
            PROBLEM_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 4},
            unit="s",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            6,
            "CrashLoop / ImagePull",
            CRASHLOOP_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 14},
            unit="s",
            transformations=[{"id": "labelsToFields", "options": {}}],
        )
    )
    panels.append(
        table_panel(
            7,
            "Terminating >10m",
            STUCK_TABLE_EXPR,
            {"h": 10, "w": 24, "x": 0, "y": 24},
            unit="s",
            transformations=[
                {"id": "labelsToFields", "options": {}},
                # The query returns every terminating pod; keep those past 600 s.
                {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
            ],
        )
    )

    pods_by_node = 'sum(kube_pod_info{pod!="" , node!=""}) by (node)'
    pods_total = 'sum(kube_pod_info{pod!="" , node!=""})'
    # NOTE(review): pie_panel fixes legendFormat to {{namespace}} while this
    # query is grouped by node — confirm the slice labels render as intended.
    panels.append(
        pie_panel(
            8,
            "Node Pod Share",
            f"({pods_by_node} / clamp_min({pods_total}, 1)) * 100",
            {"h": 8, "w": 12, "x": 12, "y": 34},
        )
    )
    panels.append(
        bargauge_panel(
            9,
            "Top Nodes by Pod Count",
            f"topk(12, {pods_by_node})",
            {"h": 8, "w": 12, "x": 0, "y": 34},
            unit="none",
            limit=12,
            decimals=0,
            thresholds={
                "mode": "absolute",
                "steps": [
                    {"color": "green", "value": None},
                    {"color": "yellow", "value": 50},
                    {"color": "orange", "value": 75},
                    {"color": "red", "value": 100},
                ],
            },
            instant=True,
        )
    )

    # Percentage of each namespace's pods that run on each node.
    share_expr = (
        '(sum by (namespace,node) (kube_pod_info{pod!=""}) '
        '/ on(namespace) group_left() clamp_min(sum by (namespace) (kube_pod_info{pod!=""}), 1) * 100)'
    )
    # 1 where a node's share equals the namespace maximum, else 0.
    mask_expr = (
        f"{share_expr} == bool on(namespace) group_left() "
        f"(max by (namespace) ({share_expr}))"
    )
    # 1 where the share is positive, else 0.
    nonzero_expr = f"{share_expr} > bool 0"
    panels.append(
        table_panel(
            10,
            "Namespace Plurality by Node v19",
            (
                f"{share_expr} * on(namespace,node) group_left() "
                f"({mask_expr}) * on(namespace,node) group_left() ({nonzero_expr})"
            ),
            {"h": 8, "w": 24, "x": 0, "y": 42},
            unit="percent",
            transformations=[
                {"id": "labelsToFields", "options": {}},
                {"id": "organize", "options": {"excludeByName": {"Time": True}}},
                {"id": "sortBy", "options": {"fields": ["node", "Value"], "order": "asc"}},
            ],
            instant=True,
            options={"showColumnFilters": False},
        )
    )

    return {
        "uid": "atlas-pods",
        "title": "Atlas Pods",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "pods"],
    }
|
|
|
|
|
|
def build_nodes_dashboard():
    """Build the "Atlas Nodes" dashboard definition.

    The panel list is assembled in this order: three readiness/workload
    stats, three API server and etcd stats, then CPU, RAM and root filesystem
    time series.

    Returns:
        dict: Dashboard model with uid ``atlas-nodes``, filed under
        ``PRIVATE_FOLDER`` and defaulting to a 12 hour time window.
    """

    def grid(h, w, x, y):
        # Panel grid position; keys are emitted in h, w, x, y order.
        return {"h": h, "w": w, "x": x, "y": y}

    def absolute_steps(base_color, *levels):
        # Fresh threshold dict on every call: a base color with no lower
        # bound followed by the given (color, value) pairs.
        steps = [{"color": base_color, "value": None}]
        steps.extend({"color": color, "value": value} for color, value in levels)
        return {"mode": "absolute", "steps": steps}

    def ready_nodes_expr(node_regex):
        # Sum of the Ready=true condition series for nodes matching the regex.
        return f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{node_regex}"}})'

    # Legend settings shared by every time series panel on this dashboard.
    table_legend = {"legend_display": "table", "legend_placement": "right"}

    panels = [
        stat_panel(
            1,
            "Worker Nodes Ready",
            ready_nodes_expr(WORKER_REGEX),
            grid(4, 8, 0, 0),
            value_suffix=WORKER_SUFFIX,
        ),
        stat_panel(
            2,
            "Control Plane Ready",
            ready_nodes_expr(CONTROL_REGEX),
            grid(4, 8, 8, 0),
            value_suffix=CONTROL_SUFFIX,
        ),
        stat_panel(
            3,
            "Control Plane Workloads",
            f'sum(kube_pod_info{{node=~"{CONTROL_REGEX}",namespace!~"{CP_ALLOWED_NS}"}})',
            grid(4, 8, 16, 0),
        ),
        stat_panel(
            9,
            "API Server 5xx rate",
            APISERVER_5XX_RATE,
            grid(4, 8, 0, 4),
            unit="req/s",
            thresholds=absolute_steps(
                "green", ("yellow", 0.05), ("orange", 0.2), ("red", 0.5)
            ),
            decimals=3,
        ),
        stat_panel(
            10,
            "API Server P99 latency",
            APISERVER_P99_LATENCY_MS,
            grid(4, 8, 8, 4),
            unit="ms",
            thresholds=absolute_steps(
                "green", ("yellow", 250), ("orange", 400), ("red", 600)
            ),
            decimals=1,
        ),
        stat_panel(
            11,
            "etcd P99 latency",
            ETCD_P99_LATENCY_MS,
            grid(4, 8, 16, 4),
            unit="ms",
            thresholds=absolute_steps(
                "green", ("yellow", 50), ("orange", 100), ("red", 200)
            ),
            decimals=1,
        ),
        timeseries_panel(
            4,
            "Node CPU",
            node_cpu_expr(),
            grid(9, 24, 0, 8),
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            **table_legend,
        ),
        timeseries_panel(
            5,
            "Node RAM",
            node_mem_expr(),
            grid(9, 24, 0, 17),
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            **table_legend,
        ),
        timeseries_panel(
            6,
            "Control Plane (incl. titan-db) CPU",
            node_cpu_expr(CONTROL_ALL_REGEX),
            grid(9, 12, 0, 26),
            unit="percent",
            legend="{{node}}",
            **table_legend,
        ),
        timeseries_panel(
            7,
            "Control Plane (incl. titan-db) RAM",
            node_mem_expr(CONTROL_ALL_REGEX),
            grid(9, 12, 12, 26),
            unit="percent",
            legend="{{node}}",
            **table_legend,
        ),
        timeseries_panel(
            8,
            "Root Filesystem Usage",
            root_usage_expr(),
            grid(9, 24, 0, 35),
            unit="percent",
            legend="{{node}}",
            **table_legend,
            time_from="30d",
        ),
    ]

    dashboard = {
        "uid": "atlas-nodes",
        "title": "Atlas Nodes",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "nodes"],
    }
    return dashboard
|
|
|
|
|
|
def build_storage_dashboard():
    """Build the "Atlas Storage" dashboard definition.

    For each of the two mount points (``/mnt/astreae`` and ``/mnt/asteria``)
    the dashboard carries a usage stat, a free-space stat, a per-node usage
    time series and a usage history time series.

    Returns:
        dict: Dashboard model with uid ``atlas-storage``, filed under
        ``PRIVATE_FOLDER`` and defaulting to a 12 hour time window.
    """

    def grid(h, w, x, y):
        # Panel grid position; keys are emitted in h, w, x, y order.
        return {"h": h, "w": w, "x": x, "y": y}

    astreae_mount = "/mnt/astreae"
    asteria_mount = "/mnt/asteria"

    # Keyword bundles shared by the paired panels of each row.
    per_node_opts = {
        "unit": "percent",
        "legend": "{{node}}",
        "legend_display": "table",
        "legend_placement": "right",
        "time_from": "30d",
    }
    history_opts = {"unit": "percent", "time_from": "90d"}

    panels = [
        stat_panel(
            1,
            "Astreae Usage",
            astreae_usage_expr(astreae_mount),
            grid(5, 6, 0, 0),
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
        ),
        stat_panel(
            2,
            "Asteria Usage",
            astreae_usage_expr(asteria_mount),
            grid(5, 6, 6, 0),
            unit="percent",
            thresholds=PERCENT_THRESHOLDS,
        ),
        stat_panel(
            3,
            "Astreae Free",
            astreae_free_expr(astreae_mount),
            grid(5, 6, 12, 0),
            unit="decbytes",
        ),
        stat_panel(
            4,
            "Asteria Free",
            astreae_free_expr(asteria_mount),
            grid(5, 6, 18, 0),
            unit="decbytes",
        ),
        timeseries_panel(
            5,
            "Astreae Per-Node Usage",
            filesystem_usage_expr(astreae_mount, LONGHORN_NODE_REGEX),
            grid(9, 12, 0, 5),
            **per_node_opts,
        ),
        timeseries_panel(
            6,
            "Asteria Per-Node Usage",
            filesystem_usage_expr(asteria_mount, LONGHORN_NODE_REGEX),
            grid(9, 12, 12, 5),
            **per_node_opts,
        ),
        timeseries_panel(
            7,
            "Astreae Usage History",
            astreae_usage_expr(astreae_mount),
            grid(9, 12, 0, 14),
            **history_opts,
        ),
        timeseries_panel(
            8,
            "Asteria Usage History",
            astreae_usage_expr(asteria_mount),
            grid(9, 12, 12, 14),
            **history_opts,
        ),
    ]

    dashboard = {
        "uid": "atlas-storage",
        "title": "Atlas Storage",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "storage"],
    }
    return dashboard
|
|
|
|
|
|
def build_network_dashboard():
    """Build the "Atlas Network" dashboard definition.

    Panels, in order: ingress success rate, two error budget burn stats,
    edge P99 latency, three traffic stats, per-node throughput, top
    namespaces and pods by network rate, and Traefik router/entrypoint
    request rates.

    Returns:
        dict: Dashboard model with uid ``atlas-network``, filed under
        ``PRIVATE_FOLDER`` and defaulting to a 12 hour time window.
    """

    def grid(h, w, x, y):
        # Panel grid position; keys are emitted in h, w, x, y order.
        return {"h": h, "w": w, "x": x, "y": y}

    def absolute_steps(base_color, *levels):
        # Fresh threshold dict on every call: a base color with no lower
        # bound followed by the given (color, value) pairs.
        steps = [{"color": base_color, "value": None}]
        steps.extend({"color": color, "value": value} for color, value in levels)
        return {"mode": "absolute", "steps": steps}

    def burn_thresholds():
        # Both error budget burn panels use the same 1 / 2 / 4 breakpoints.
        return absolute_steps("green", ("yellow", 1), ("orange", 2), ("red", 4))

    def labels_to_fields():
        # New transformation list per table panel so nothing is shared.
        return [{"id": "labelsToFields", "options": {}}]

    # Legend settings shared by every time series panel on this dashboard.
    table_legend = {"legend_display": "table", "legend_placement": "right"}

    panels = [
        stat_panel(
            1,
            "Ingress Success Rate (5m)",
            TRAEFIK_SLI_5M,
            grid(4, 6, 0, 0),
            unit="percentunit",
            decimals=2,
            # Inverted scale: higher is better, so the base color is red.
            thresholds=absolute_steps(
                "red", ("orange", 0.995), ("yellow", 0.999), ("green", 0.9995)
            ),
        ),
        stat_panel(
            2,
            "Error Budget Burn (1h)",
            traefik_burn("1h"),
            grid(4, 6, 6, 0),
            thresholds=burn_thresholds(),
            decimals=2,
        ),
        stat_panel(
            3,
            "Error Budget Burn (6h)",
            traefik_burn("6h"),
            grid(4, 6, 12, 0),
            thresholds=burn_thresholds(),
            decimals=2,
        ),
        stat_panel(
            4,
            "Edge P99 Latency (ms)",
            TRAEFIK_P99_LATENCY_MS,
            grid(4, 6, 18, 0),
            unit="ms",
            thresholds=absolute_steps(
                "green", ("yellow", 200), ("orange", 350), ("red", 500)
            ),
            decimals=1,
        ),
        stat_panel(5, "Ingress Traffic", NET_INGRESS_EXPR, grid(4, 8, 0, 4), unit="Bps"),
        stat_panel(6, "Egress Traffic", NET_EGRESS_EXPR, grid(4, 8, 8, 4), unit="Bps"),
        stat_panel(7, "Intra-Cluster Traffic", NET_INTERNAL_EXPR, grid(4, 8, 16, 4), unit="Bps"),
        timeseries_panel(
            8,
            "Per-Node Throughput",
            f'avg by (node) (({NET_NODE_TX_PHYS} + {NET_NODE_RX_PHYS}) * on(instance) group_left(node) {NODE_INFO})',
            grid(8, 24, 0, 8),
            unit="Bps",
            legend="{{node}}",
            **table_legend,
        ),
        table_panel(
            9,
            "Top Namespaces",
            (
                'topk(10, sum(rate(container_network_transmit_bytes_total{namespace!=""}[5m]) '
                '+ rate(container_network_receive_bytes_total{namespace!=""}[5m])) by (namespace))'
            ),
            grid(9, 12, 0, 16),
            unit="Bps",
            transformations=labels_to_fields(),
        ),
        table_panel(
            10,
            "Top Pods",
            (
                'topk(10, sum(rate(container_network_transmit_bytes_total{pod!=""}[5m]) '
                '+ rate(container_network_receive_bytes_total{pod!=""}[5m])) by (namespace,pod))'
            ),
            grid(9, 12, 12, 16),
            unit="Bps",
            transformations=labels_to_fields(),
        ),
        timeseries_panel(
            11,
            "Traefik Routers (req/s)",
            f"topk(10, {TRAEFIK_ROUTER_EXPR})",
            grid(9, 12, 0, 25),
            unit="req/s",
            legend="{{router}}",
            **table_legend,
        ),
        timeseries_panel(
            12,
            "Traefik Entrypoints (req/s)",
            'sum by (entrypoint) (rate(traefik_entrypoint_requests_total[5m]))',
            grid(9, 12, 12, 25),
            unit="req/s",
            legend="{{entrypoint}}",
            **table_legend,
        ),
    ]

    dashboard = {
        "uid": "atlas-network",
        "title": "Atlas Network",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "network"],
    }
    return dashboard
|
|
|
|
|
|
def build_gpu_dashboard():
    """Build the "Atlas GPU" dashboard definition.

    Panels, in order: a namespace GPU share pie, GPU utilization time series
    by namespace and by ``Hostname``, and a table of the top pods by GPU
    utilization.

    Returns:
        dict: Dashboard model with uid ``atlas-gpu``, filed under
        ``PRIVATE_FOLDER`` and defaulting to a 12 hour time window.
    """

    def grid(x, y):
        # Every panel on this dashboard is 8 high and 12 wide; only the
        # position varies. Keys are emitted in h, w, x, y order.
        return {"h": 8, "w": 12, "x": x, "y": y}

    # Legend settings shared by both time series panels.
    table_legend = {"legend_display": "table", "legend_placement": "right"}

    panels = [
        pie_panel(
            1,
            "Namespace GPU Share",
            namespace_gpu_share_expr(),
            grid(0, 0),
        ),
        timeseries_panel(
            2,
            "GPU Util by Namespace",
            NAMESPACE_GPU_USAGE_INSTANT,
            grid(12, 0),
            unit="percent",
            legend="{{namespace}}",
            **table_legend,
        ),
        timeseries_panel(
            3,
            "GPU Util by Node",
            'sum by (Hostname) (DCGM_FI_DEV_GPU_UTIL{pod!=""})',
            grid(0, 8),
            unit="percent",
            legend="{{Hostname}}",
            **table_legend,
        ),
        table_panel(
            4,
            "Top Pods by GPU Util",
            'topk(10, sum(DCGM_FI_DEV_GPU_UTIL{pod!=""}) by (namespace,pod,Hostname))',
            grid(12, 8),
            unit="percent",
            transformations=[{"id": "labelsToFields", "options": {}}],
        ),
    ]

    dashboard = {
        "uid": "atlas-gpu",
        "title": "Atlas GPU",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "gpu"],
    }
    return dashboard
|
|
|
|
|
|
# Registry of dashboards keyed by uid. Each entry pairs the builder that
# produces the dashboard model with the ConfigMap manifest it is rendered into.
DASHBOARDS = {
    uid: {
        "builder": builder,
        "configmap": ROOT / "services" / "monitoring" / manifest,
    }
    for uid, builder, manifest in (
        ("atlas-overview", build_overview, "grafana-dashboard-overview.yaml"),
        ("atlas-pods", build_pods_dashboard, "grafana-dashboard-pods.yaml"),
        ("atlas-nodes", build_nodes_dashboard, "grafana-dashboard-nodes.yaml"),
        ("atlas-storage", build_storage_dashboard, "grafana-dashboard-storage.yaml"),
        ("atlas-network", build_network_dashboard, "grafana-dashboard-network.yaml"),
        ("atlas-gpu", build_gpu_dashboard, "grafana-dashboard-gpu.yaml"),
    )
}
|
|
|
|
|
|
def write_json(uid, data):
    """Write *data* as JSON to ``DASHBOARD_DIR/<uid>.json``.

    The dashboard directory is created first if it does not exist. The file
    is serialized with a 2-space indent and ends with a trailing newline.
    """
    DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
    target = DASHBOARD_DIR / f"{uid}.json"
    serialized = json.dumps(data, indent=2)
    target.write_text(f"{serialized}\n")
|
|
|
|
|
|
def render_configmap(uid, info):
    """Render the stored JSON for dashboard *uid* into its ConfigMap manifest.

    Reads ``DASHBOARD_DIR/<uid>.json``, re-serializes it with a 2-space
    indent, prefixes every line so it nests under the template's block
    scalar, fills ``CONFIG_TEMPLATE`` and writes the result to
    ``info["configmap"]``. A one-line summary is printed afterwards.
    """
    source = DASHBOARD_DIR / f"{uid}.json"
    document = json.loads(source.read_text())

    # NOTE(review): this prefix must match the indentation expected for the
    # block scalar body in CONFIG_TEMPLATE — confirm against the template.
    prefixed_lines = [" " + line for line in json.dumps(document, indent=2).splitlines()]

    destination = info["configmap"]
    shown_path = destination.relative_to(ROOT)
    rendered = CONFIG_TEMPLATE.format(
        relative_path=shown_path,
        name=destination.stem,
        key=source.name,
        payload="\n".join(prefixed_lines),
    )
    destination.write_text(rendered)
    print(f"Rendered {source.name} -> {shown_path}")
|
|
|
|
|
|
def main():
    """Command-line entry point.

    With ``--build``, every builder registered in ``DASHBOARDS`` is run and
    its result written to the dashboard JSON directory first. In all cases
    each dashboard's stored JSON is then rendered into its ConfigMap.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains a pre-formatted "Usage:" section; the
        # default formatter would re-wrap it into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--build", action="store_true", help="Regenerate dashboard JSON files from builders")
    args = parser.parse_args()

    if args.build:
        for uid, info in DASHBOARDS.items():
            write_json(uid, info["builder"]())

    # Rendering always runs, so ConfigMaps reflect whatever JSON is on disk.
    for uid, info in DASHBOARDS.items():
        render_configmap(uid, info)
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|