monitoring: restructure grafana dashboards
This commit is contained in:
parent
b004bf99dc
commit
a41f25e66d
605
scripts/render_dashboards.py
Executable file
605
scripts/render_dashboards.py
Executable file
@ -0,0 +1,605 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate Grafana dashboards and render them into ConfigMaps.
|
||||
|
||||
Usage:
|
||||
python scripts/render_dashboards.py --build # rebuild JSON + ConfigMaps
|
||||
python scripts/render_dashboards.py # just render ConfigMaps
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import textwrap
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards"
|
||||
CONFIG_TEMPLATE = textwrap.dedent(
|
||||
"""# {relative_path}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {name}
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
data:
|
||||
{key}: |
|
||||
{payload}
|
||||
"""
|
||||
)
|
||||
|
||||
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Panel helper factories
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
def stat_panel(panel_id, title, expr, grid, *, unit="none", thresholds=None,
|
||||
text_mode="value", legend=None):
|
||||
defaults = {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"mappings": [],
|
||||
"thresholds": thresholds
|
||||
or {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "rgba(115, 115, 115, 1)", "value": None},
|
||||
{"color": "green", "value": 1},
|
||||
],
|
||||
},
|
||||
"unit": unit,
|
||||
}
|
||||
panel = {
|
||||
"id": panel_id,
|
||||
"type": "stat",
|
||||
"title": title,
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": grid,
|
||||
"targets": [{"expr": expr, "refId": "A"}],
|
||||
"fieldConfig": {"defaults": defaults, "overrides": []},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
||||
"textMode": text_mode,
|
||||
},
|
||||
}
|
||||
if legend:
|
||||
panel["targets"][0]["legendFormat"] = legend
|
||||
return panel
|
||||
|
||||
|
||||
def timeseries_panel(panel_id, title, expr, grid, *, unit="none", legend=None,
|
||||
legend_display="table", legend_placement="bottom",
|
||||
legend_calcs=None, time_from=None):
|
||||
panel = {
|
||||
"id": panel_id,
|
||||
"type": "timeseries",
|
||||
"title": title,
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": grid,
|
||||
"targets": [{"expr": expr, "refId": "A"}],
|
||||
"fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": legend_display,
|
||||
"placement": legend_placement,
|
||||
},
|
||||
"tooltip": {"mode": "multi"},
|
||||
},
|
||||
}
|
||||
if legend:
|
||||
panel["targets"][0]["legendFormat"] = legend
|
||||
if legend_calcs:
|
||||
panel["options"]["legend"]["calcs"] = legend_calcs
|
||||
if time_from:
|
||||
panel["timeFrom"] = time_from
|
||||
return panel
|
||||
|
||||
|
||||
def table_panel(panel_id, title, expr, grid, *, unit="none", transformations=None,
|
||||
description=None):
|
||||
panel = {
|
||||
"id": panel_id,
|
||||
"type": "table",
|
||||
"title": title,
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": grid,
|
||||
"targets": [{"expr": expr, "refId": "A"}],
|
||||
"fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
|
||||
"options": {"showHeader": True},
|
||||
}
|
||||
if transformations:
|
||||
panel["transformations"] = transformations
|
||||
if description:
|
||||
panel["description"] = description
|
||||
return panel
|
||||
|
||||
|
||||
def pie_panel(panel_id, title, expr, grid):
|
||||
return {
|
||||
"id": panel_id,
|
||||
"type": "piechart",
|
||||
"title": title,
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": grid,
|
||||
"targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}],
|
||||
"fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []},
|
||||
"options": {
|
||||
"legend": {"displayMode": "list", "placement": "right"},
|
||||
"pieType": "pie",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def text_panel(panel_id, title, content, grid):
|
||||
return {
|
||||
"id": panel_id,
|
||||
"type": "text",
|
||||
"title": title,
|
||||
"gridPos": grid,
|
||||
"datasource": None,
|
||||
"options": {"mode": "markdown", "content": content},
|
||||
}
|
||||
|
||||
|
||||
def node_cpu_expr(scope=""):
|
||||
expr = "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))"
|
||||
if scope:
|
||||
expr = f"{expr} * on(node) group_left() kube_node_info{{node=~\"{scope}\"}}"
|
||||
return expr
|
||||
|
||||
|
||||
def node_mem_expr(scope=""):
|
||||
expr = "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))"
|
||||
if scope:
|
||||
expr = f"{expr} * on(node) group_left() kube_node_info{{node=~\"{scope}\"}}"
|
||||
return expr
|
||||
|
||||
|
||||
def root_usage_expr():
|
||||
return "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)"
|
||||
|
||||
|
||||
def astreae_usage_expr(mount):
|
||||
return (
|
||||
f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / "
|
||||
f"sum(node_filesystem_size_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) * 100)"
|
||||
)
|
||||
|
||||
|
||||
def astreae_free_expr(mount):
|
||||
return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})"
|
||||
|
||||
|
||||
def build_overview():
|
||||
thresholds_percent = {
|
||||
"mode": "percentage",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 70},
|
||||
{"color": "red", "value": 85},
|
||||
],
|
||||
}
|
||||
panels = []
|
||||
stats = [
|
||||
(1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})'),
|
||||
(2, "Ready nodes", 'sum(kube_node_status_condition{condition="Ready",status="true"})'),
|
||||
(3, "Control plane ready", 'sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})'),
|
||||
(4, "Control plane schedulable", 'sum(kube_node_spec_unschedulable{node=~"titan-0a|titan-0b|titan-0c"} == 0)'),
|
||||
(5, "Problem pods", 'sum(kube_pod_status_phase{phase!~"Running|Succeeded"})'),
|
||||
(6, "Stuck terminating", 'sum(((time() - kube_pod_deletion_timestamp) > 600))'),
|
||||
]
|
||||
for idx, (panel_id, title, expr) in enumerate(stats):
|
||||
panels.append(
|
||||
stat_panel(
|
||||
panel_id,
|
||||
title,
|
||||
expr,
|
||||
{"h": 5, "w": 4, "x": 4 * idx, "y": 0},
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
7,
|
||||
"Hottest node: CPU",
|
||||
node_cpu_expr(),
|
||||
{"h": 5, "w": 4, "x": 24, "y": 0},
|
||||
unit="percent",
|
||||
thresholds=thresholds_percent,
|
||||
text_mode="value_and_name",
|
||||
legend="{{node}}",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
8,
|
||||
"Hottest node: RAM",
|
||||
node_mem_expr(),
|
||||
{"h": 5, "w": 4, "x": 28, "y": 0},
|
||||
unit="percent",
|
||||
thresholds=thresholds_percent,
|
||||
text_mode="value_and_name",
|
||||
legend="{{node}}",
|
||||
)
|
||||
)
|
||||
|
||||
panels.append(pie_panel(9, "Namespace CPU share", 'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace))', {"h": 9, "w": 12, "x": 0, "y": 5}))
|
||||
panels.append(pie_panel(10, "Namespace RAM share", 'topk(10, sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace))', {"h": 9, "w": 12, "x": 12, "y": 5}))
|
||||
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
11,
|
||||
"Cluster node CPU",
|
||||
node_cpu_expr(),
|
||||
{"h": 8, "w": 12, "x": 0, "y": 14},
|
||||
unit="percent",
|
||||
legend="{{node}}",
|
||||
legend_calcs=["last"],
|
||||
legend_display="table",
|
||||
legend_placement="right",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
12,
|
||||
"Cluster node RAM",
|
||||
node_mem_expr(),
|
||||
{"h": 8, "w": 12, "x": 12, "y": 14},
|
||||
unit="percent",
|
||||
legend="{{node}}",
|
||||
legend_calcs=["last"],
|
||||
legend_display="table",
|
||||
legend_placement="right",
|
||||
)
|
||||
)
|
||||
|
||||
panels.append(
|
||||
table_panel(
|
||||
13,
|
||||
"Problem pods (details)",
|
||||
"(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
|
||||
{"h": 8, "w": 12, "x": 0, "y": 22},
|
||||
unit="s",
|
||||
transformations=[{"id": "labelsToFields", "options": {}}],
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
table_panel(
|
||||
14,
|
||||
"Terminating >10m",
|
||||
"(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
|
||||
{"h": 8, "w": 12, "x": 12, "y": 22},
|
||||
unit="s",
|
||||
transformations=[
|
||||
{"id": "labelsToFields", "options": {}} ,
|
||||
{"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
15,
|
||||
"Control plane CPU",
|
||||
node_cpu_expr("titan-0a|titan-0b|titan-0c"),
|
||||
{"h": 7, "w": 12, "x": 0, "y": 30},
|
||||
unit="percent",
|
||||
legend="{{node}}",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
16,
|
||||
"Control plane RAM",
|
||||
node_mem_expr("titan-0a|titan-0b|titan-0c"),
|
||||
{"h": 7, "w": 12, "x": 12, "y": 30},
|
||||
unit="percent",
|
||||
legend="{{node}}",
|
||||
)
|
||||
)
|
||||
|
||||
panels.append(
|
||||
timeseries_panel(
|
||||
17,
|
||||
"Root filesystem usage",
|
||||
root_usage_expr(),
|
||||
{"h": 8, "w": 12, "x": 0, "y": 37},
|
||||
unit="percent",
|
||||
legend="{{node}}",
|
||||
legend_calcs=["last"],
|
||||
legend_display="table",
|
||||
legend_placement="right",
|
||||
time_from="7d",
|
||||
)
|
||||
)
|
||||
|
||||
panels.append(
|
||||
{
|
||||
"id": 18,
|
||||
"type": "bargauge",
|
||||
"title": "Nodes closest to full root disks",
|
||||
"datasource": PROM_DS,
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 37},
|
||||
"targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "percentage",
|
||||
"steps": [
|
||||
{"color": "green", "value": None},
|
||||
{"color": "yellow", "value": 50},
|
||||
{"color": "orange", "value": 70},
|
||||
{"color": "red", "value": 85},
|
||||
],
|
||||
},
|
||||
},
|
||||
"overrides": [],
|
||||
},
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
panels.append(
|
||||
stat_panel(
|
||||
19,
|
||||
"Astreae usage",
|
||||
astreae_usage_expr("/mnt/astreae"),
|
||||
{"h": 6, "w": 6, "x": 0, "y": 45},
|
||||
unit="percent",
|
||||
thresholds=thresholds_percent,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
20,
|
||||
"Asteria usage",
|
||||
astreae_usage_expr("/mnt/asteria"),
|
||||
{"h": 6, "w": 6, "x": 6, "y": 45},
|
||||
unit="percent",
|
||||
thresholds=thresholds_percent,
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
21,
|
||||
"Astreae free",
|
||||
astreae_free_expr("/mnt/astreae"),
|
||||
{"h": 6, "w": 6, "x": 12, "y": 45},
|
||||
unit="bytesSI",
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
stat_panel(
|
||||
22,
|
||||
"Asteria free",
|
||||
astreae_free_expr("/mnt/asteria"),
|
||||
{"h": 6, "w": 6, "x": 18, "y": 45},
|
||||
unit="bytesSI",
|
||||
)
|
||||
)
|
||||
|
||||
panels.append(
|
||||
table_panel(
|
||||
23,
|
||||
"Astreae per-node usage",
|
||||
'100 - (node_filesystem_avail_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} * 100)',
|
||||
{"h": 8, "w": 12, "x": 0, "y": 51},
|
||||
unit="percent",
|
||||
transformations=[{"id": "labelsToFields", "options": {}}],
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
table_panel(
|
||||
24,
|
||||
"Asteria per-node usage",
|
||||
'100 - (node_filesystem_avail_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} * 100)',
|
||||
{"h": 8, "w": 12, "x": 12, "y": 51},
|
||||
unit="percent",
|
||||
transformations=[{"id": "labelsToFields", "options": {}}],
|
||||
)
|
||||
)
|
||||
|
||||
panels.append(
|
||||
text_panel(
|
||||
25,
|
||||
"About this dashboard",
|
||||
"### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders",
|
||||
{"h": 5, "w": 24, "x": 0, "y": 59},
|
||||
)
|
||||
)
|
||||
|
||||
return {
|
||||
"uid": "atlas-overview",
|
||||
"title": "Atlas Overview",
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {"type": "datasource", "uid": "grafana"},
|
||||
"enable": True,
|
||||
"hide": True,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard",
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": False,
|
||||
"folderUid": "atlas-overview",
|
||||
"graphTooltip": 0,
|
||||
"links": [
|
||||
{"title": "Pods dashboard", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
|
||||
{"title": "Nodes dashboard", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
|
||||
{"title": "Storage dashboard", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False},
|
||||
],
|
||||
"panels": panels,
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": ["atlas", "overview"],
|
||||
"templating": {"list": []},
|
||||
"time": {"from": "now-12h", "to": "now"},
|
||||
}
|
||||
|
||||
|
||||
def build_pods_dashboard():
|
||||
panels = []
|
||||
panels.append(
|
||||
table_panel(
|
||||
1,
|
||||
"Pods not running",
|
||||
"(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
|
||||
{"h": 10, "w": 24, "x": 0, "y": 0},
|
||||
unit="s",
|
||||
transformations=[{"id": "labelsToFields", "options": {}}],
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
table_panel(
|
||||
2,
|
||||
"CrashLoop / ImagePull",
|
||||
"(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})",
|
||||
{"h": 10, "w": 24, "x": 0, "y": 10},
|
||||
unit="s",
|
||||
transformations=[{"id": "labelsToFields", "options": {}}],
|
||||
)
|
||||
)
|
||||
panels.append(
|
||||
table_panel(
|
||||
3,
|
||||
"Terminating pods",
|
||||
"(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
|
||||
{"h": 10, "w": 24, "x": 0, "y": 20},
|
||||
unit="s",
|
||||
transformations=[
|
||||
{"id": "labelsToFields", "options": {}} ,
|
||||
{"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
|
||||
],
|
||||
)
|
||||
)
|
||||
return {
|
||||
"uid": "atlas-pods",
|
||||
"title": "Atlas Pods",
|
||||
"folderUid": "atlas-pods",
|
||||
"editable": True,
|
||||
"panels": panels,
|
||||
"time": {"from": "now-12h", "to": "now"},
|
||||
"annotations": {"list": []},
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": ["atlas", "pods"],
|
||||
}
|
||||
|
||||
|
||||
def build_nodes_dashboard():
|
||||
panels = []
|
||||
panels.append(stat_panel(1, "Node count", 'count(kube_node_info)', {"h": 5, "w": 6, "x": 0, "y": 0}))
|
||||
panels.append(stat_panel(2, "Ready nodes", 'sum(kube_node_status_condition{condition="Ready",status="true"})', {"h": 5, "w": 6, "x": 6, "y": 0}))
|
||||
panels.append(stat_panel(3, "Control plane CPU avg", node_cpu_expr("titan-0a|titan-0b|titan-0c"), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="percent", legend="{{node}}", text_mode="value_and_name"))
|
||||
panels.append(stat_panel(4, "Control plane RAM avg", node_mem_expr("titan-0a|titan-0b|titan-0c"), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="percent", legend="{{node}}", text_mode="value_and_name"))
|
||||
panels.append(timeseries_panel(5, "Node CPU", node_cpu_expr(), {"h": 9, "w": 24, "x": 0, "y": 5}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right"))
|
||||
panels.append(timeseries_panel(6, "Node RAM", node_mem_expr(), {"h": 9, "w": 24, "x": 0, "y": 14}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right"))
|
||||
panels.append(timeseries_panel(7, "Root filesystem", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 23}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="7d"))
|
||||
return {
|
||||
"uid": "atlas-nodes",
|
||||
"title": "Atlas Nodes",
|
||||
"folderUid": "atlas-nodes",
|
||||
"editable": True,
|
||||
"panels": panels,
|
||||
"time": {"from": "now-12h", "to": "now"},
|
||||
"annotations": {"list": []},
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": ["atlas", "nodes"],
|
||||
}
|
||||
|
||||
|
||||
def build_storage_dashboard():
|
||||
panels = []
|
||||
panels.append(stat_panel(1, "Astreae usage", astreae_usage_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 0, "y": 0}, unit="percent"))
|
||||
panels.append(stat_panel(2, "Asteria usage", astreae_usage_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 6, "y": 0}, unit="percent"))
|
||||
panels.append(stat_panel(3, "Astreae free", astreae_free_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="bytesSI"))
|
||||
panels.append(stat_panel(4, "Asteria free", astreae_free_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="bytesSI"))
|
||||
panels.append(timeseries_panel(5, "Root filesystem", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 5}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="30d"))
|
||||
panels.append(table_panel(6, "Astreae nodes", '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} * 100)', {"h": 10, "w": 12, "x": 0, "y": 14}, unit="percent", transformations=[{"id": "labelsToFields", "options": {}}]))
|
||||
panels.append(table_panel(7, "Asteria nodes", '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} * 100)', {"h": 10, "w": 12, "x": 12, "y": 14}, unit="percent", transformations=[{"id": "labelsToFields", "options": {}}]))
|
||||
return {
|
||||
"uid": "atlas-storage",
|
||||
"title": "Atlas Storage",
|
||||
"folderUid": "atlas-storage",
|
||||
"editable": True,
|
||||
"panels": panels,
|
||||
"time": {"from": "now-12h", "to": "now"},
|
||||
"annotations": {"list": []},
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": ["atlas", "storage"],
|
||||
}
|
||||
|
||||
|
||||
DASHBOARDS = {
|
||||
"atlas-overview": {
|
||||
"builder": build_overview,
|
||||
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-overview.yaml",
|
||||
},
|
||||
"atlas-pods": {
|
||||
"builder": build_pods_dashboard,
|
||||
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-pods.yaml",
|
||||
},
|
||||
"atlas-nodes": {
|
||||
"builder": build_nodes_dashboard,
|
||||
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-nodes.yaml",
|
||||
},
|
||||
"atlas-storage": {
|
||||
"builder": build_storage_dashboard,
|
||||
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-storage.yaml",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def write_json(uid: str, data: dict) -> None:
|
||||
DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
|
||||
path = DASHBOARD_DIR / f"{uid}.json"
|
||||
path.write_text(json.dumps(data, indent=2) + "\n")
|
||||
|
||||
|
||||
def render_configmap(uid: str, data: dict) -> None:
|
||||
json_path = DASHBOARD_DIR / f"{uid}.json"
|
||||
payload = json.dumps(json.loads(json_path.read_text()), indent=2)
|
||||
indented = "\n".join(" " + line for line in payload.splitlines())
|
||||
output_path = data["configmap"]
|
||||
content = CONFIG_TEMPLATE.format(
|
||||
relative_path=output_path.relative_to(ROOT),
|
||||
name=output_path.stem,
|
||||
key=json_path.name,
|
||||
payload=indented,
|
||||
)
|
||||
output_path.write_text(content)
|
||||
print(f"Rendered {json_path.name} -> {output_path.relative_to(ROOT)}")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--build", action="store_true", help="Regenerate dashboard JSON files from builders")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.build:
|
||||
for uid, info in DASHBOARDS.items():
|
||||
write_json(uid, info["builder"]())
|
||||
|
||||
for uid, info in DASHBOARDS.items():
|
||||
render_configmap(uid, info)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
369
services/monitoring/dashboards/atlas-nodes.json
Normal file
369
services/monitoring/dashboards/atlas-nodes.json
Normal file
@ -0,0 +1,369 @@
|
||||
{
|
||||
"uid": "atlas-nodes",
|
||||
"title": "Atlas Nodes",
|
||||
"folderUid": "atlas-nodes",
|
||||
"editable": true,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Node count",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(kube_node_info)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Ready nodes",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Control plane CPU avg",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value_and_name"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Control plane RAM avg",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value_and_name"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "timeseries",
|
||||
"title": "Node CPU",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"last"
|
||||
]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "timeseries",
|
||||
"title": "Node RAM",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"last"
|
||||
]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "timeseries",
|
||||
"title": "Root filesystem",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 23
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"timeFrom": "7d"
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-12h",
|
||||
"to": "now"
|
||||
},
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"nodes"
|
||||
]
|
||||
}
|
||||
1270
services/monitoring/dashboards/atlas-overview.json
Normal file
1270
services/monitoring/dashboards/atlas-overview.json
Normal file
File diff suppressed because it is too large
Load Diff
137
services/monitoring/dashboards/atlas-pods.json
Normal file
137
services/monitoring/dashboards/atlas-pods.json
Normal file
@ -0,0 +1,137 @@
|
||||
{
|
||||
"uid": "atlas-pods",
|
||||
"title": "Atlas Pods",
|
||||
"folderUid": "atlas-pods",
|
||||
"editable": true,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "table",
|
||||
"title": "Pods not running",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "table",
|
||||
"title": "CrashLoop / ImagePull",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "table",
|
||||
"title": "Terminating pods",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 20
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": {}
|
||||
},
|
||||
{
|
||||
"id": "filterByValue",
|
||||
"options": {
|
||||
"match": "Value",
|
||||
"operator": "gt",
|
||||
"value": 600
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-12h",
|
||||
"to": "now"
|
||||
},
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"pods"
|
||||
]
|
||||
}
|
||||
359
services/monitoring/dashboards/atlas-storage.json
Normal file
359
services/monitoring/dashboards/atlas-storage.json
Normal file
@ -0,0 +1,359 @@
|
||||
{
|
||||
"uid": "atlas-storage",
|
||||
"title": "Atlas Storage",
|
||||
"folderUid": "atlas-storage",
|
||||
"editable": true,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Astreae usage",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Asteria usage",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Astreae free",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bytesSI"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Asteria free",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bytesSI"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "timeseries",
|
||||
"title": "Root filesystem",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"timeFrom": "30d"
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "table",
|
||||
"title": "Astreae nodes",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "table",
|
||||
"title": "Asteria nodes",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 14
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": {}
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-12h",
|
||||
"to": "now"
|
||||
},
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"storage"
|
||||
]
|
||||
}
|
||||
@ -1,38 +1,22 @@
|
||||
# services/monitoring/grafana-dashboard-sre.yaml
|
||||
# services/monitoring/grafana-dashboard-nodes.yaml
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboard-sre
|
||||
name: grafana-dashboard-nodes
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
data:
|
||||
atlas-sre-overview.json: |
|
||||
atlas-nodes.json: |
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "grafana"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"uid": "atlas-nodes",
|
||||
"title": "Atlas Nodes",
|
||||
"folderUid": "atlas-nodes",
|
||||
"editable": true,
|
||||
"folderUid": "atlas-sre",
|
||||
"graphTooltip": 0,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Ready nodes",
|
||||
"title": "Node count",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -45,7 +29,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / count(kube_node_info) * 100",
|
||||
"expr": "count(kube_node_info)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -56,23 +40,19 @@ data:
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "percentage",
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 95
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 99
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
"unit": "none"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
@ -93,7 +73,7 @@ data:
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Pending pods",
|
||||
"title": "Ready nodes",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -106,7 +86,7 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_pod_status_phase{phase=\"Pending\"})",
|
||||
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
@ -120,16 +100,12 @@ data:
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 3
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 10
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -154,7 +130,7 @@ data:
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Unavailable deployment replicas",
|
||||
"title": "Control plane CPU avg",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -167,8 +143,9 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_deployment_status_replicas_unavailable)",
|
||||
"refId": "A"
|
||||
"expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
@ -181,20 +158,16 @@ data:
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"color": "green",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 3
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none"
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
@ -209,13 +182,13 @@ data:
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
"textMode": "value_and_name"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Active alerts",
|
||||
"title": "Control plane RAM avg",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
@ -228,8 +201,9 @@ data:
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(ALERTS{alertstate=\"firing\"})",
|
||||
"refId": "A"
|
||||
"expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
@ -242,20 +216,16 @@ data:
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"color": "green",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 3
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "none"
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
@ -270,20 +240,20 @@ data:
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
"textMode": "value_and_name"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "timeseries",
|
||||
"title": "Node CPU usage",
|
||||
"title": "Node CPU",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
@ -303,7 +273,10 @@ data:
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"last"
|
||||
]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
@ -313,16 +286,16 @@ data:
|
||||
{
|
||||
"id": 6,
|
||||
"type": "timeseries",
|
||||
"title": "Node memory usage",
|
||||
"title": "Node RAM",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 5
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
@ -340,7 +313,10 @@ data:
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"last"
|
||||
]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
@ -350,201 +326,22 @@ data:
|
||||
{
|
||||
"id": 7,
|
||||
"type": "timeseries",
|
||||
"title": "Top pod CPU (5m avg)",
|
||||
"title": "Root filesystem",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(5, sum(rate(container_cpu_usage_seconds_total{pod!=\"\",container!=\"\"}[5m])) by (namespace,pod))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}/{{pod}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "cores"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "timeseries",
|
||||
"title": "Top pod memory working set",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 14
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(5, sum(container_memory_working_set_bytes{pod!=\"\",container!=\"\"}) by (namespace,pod))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}/{{pod}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"type": "bargauge",
|
||||
"title": "Namespace restart rate (6h)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 23
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(8, sum(increase(kube_pod_container_status_restarts_total{namespace!=\"\"}[6h])) by (namespace))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"type": "table",
|
||||
"title": "Deployments missing replicas",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 23
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10, sum by (namespace,deployment) (kube_deployment_status_replicas_unavailable))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "timeseries",
|
||||
"title": "Pod phase breakdown",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 31
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kube_pod_status_phase) by (phase)",
|
||||
"expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{phase}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "none"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"type": "timeseries",
|
||||
"title": "PVC usage (top 8)",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 31
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(8, sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100))",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
@ -556,28 +353,26 @@ data:
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom"
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
}
|
||||
},
|
||||
"timeFrom": "7d"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"sre"
|
||||
],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-12h",
|
||||
"to": "now"
|
||||
},
|
||||
"title": "Atlas SRE Overview",
|
||||
"uid": "atlas-sre",
|
||||
"version": 4
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"nodes"
|
||||
]
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
146
services/monitoring/grafana-dashboard-pods.yaml
Normal file
146
services/monitoring/grafana-dashboard-pods.yaml
Normal file
@ -0,0 +1,146 @@
|
||||
# services/monitoring/grafana-dashboard-pods.yaml
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboard-pods
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
data:
|
||||
atlas-pods.json: |
|
||||
{
|
||||
"uid": "atlas-pods",
|
||||
"title": "Atlas Pods",
|
||||
"folderUid": "atlas-pods",
|
||||
"editable": true,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "table",
|
||||
"title": "Pods not running",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "table",
|
||||
"title": "CrashLoop / ImagePull",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "table",
|
||||
"title": "Terminating pods",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 20
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": {}
|
||||
},
|
||||
{
|
||||
"id": "filterByValue",
|
||||
"options": {
|
||||
"match": "Value",
|
||||
"operator": "gt",
|
||||
"value": 600
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-12h",
|
||||
"to": "now"
|
||||
},
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"pods"
|
||||
]
|
||||
}
|
||||
368
services/monitoring/grafana-dashboard-storage.yaml
Normal file
368
services/monitoring/grafana-dashboard-storage.yaml
Normal file
@ -0,0 +1,368 @@
|
||||
# services/monitoring/grafana-dashboard-storage.yaml
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboard-storage
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
data:
|
||||
atlas-storage.json: |
|
||||
{
|
||||
"uid": "atlas-storage",
|
||||
"title": "Atlas Storage",
|
||||
"folderUid": "atlas-storage",
|
||||
"editable": true,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Astreae usage",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Asteria usage",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Astreae free",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bytesSI"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Asteria free",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 0
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "rgba(115, 115, 115, 1)",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bytesSI"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "timeseries",
|
||||
"title": "Root filesystem",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
|
||||
"refId": "A",
|
||||
"legendFormat": "{{node}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"timeFrom": "30d"
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "table",
|
||||
"title": "Astreae nodes",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": {}
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "table",
|
||||
"title": "Asteria nodes",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "atlas-vm"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 14
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "labelsToFields",
|
||||
"options": {}
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"time": {
|
||||
"from": "now-12h",
|
||||
"to": "now"
|
||||
},
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"atlas",
|
||||
"storage"
|
||||
]
|
||||
}
|
||||
@ -10,8 +10,8 @@ data:
|
||||
folders.yaml: |
|
||||
apiVersion: 1
|
||||
folders:
|
||||
- uid: atlas-public
|
||||
title: Atlas Public
|
||||
- uid: atlas-overview
|
||||
title: Atlas Overview
|
||||
permissions:
|
||||
- role: Viewer
|
||||
permission: View
|
||||
@ -19,8 +19,22 @@ data:
|
||||
permission: Edit
|
||||
- role: Admin
|
||||
permission: Admin
|
||||
- uid: atlas-sre
|
||||
title: Atlas SRE
|
||||
- uid: atlas-pods
|
||||
title: Atlas Pods
|
||||
permissions:
|
||||
- role: Editor
|
||||
permission: View
|
||||
- role: Admin
|
||||
permission: Admin
|
||||
- uid: atlas-nodes
|
||||
title: Atlas Nodes
|
||||
permissions:
|
||||
- role: Editor
|
||||
permission: View
|
||||
- role: Admin
|
||||
permission: Admin
|
||||
- uid: atlas-storage
|
||||
title: Atlas Storage
|
||||
permissions:
|
||||
- role: Editor
|
||||
permission: View
|
||||
|
||||
@ -244,8 +244,8 @@ spec:
|
||||
GF_SECURITY_ALLOW_EMBEDDING: "true"
|
||||
grafana.ini:
|
||||
server:
|
||||
domain: atlas.metrics.bstein.dev
|
||||
root_url: https://atlas.metrics.bstein.dev/
|
||||
domain: metrics.bstein.dev
|
||||
root_url: https://metrics.bstein.dev/
|
||||
auth.anonymous:
|
||||
hide_version: true
|
||||
users:
|
||||
@ -256,12 +256,12 @@ spec:
|
||||
annotations:
|
||||
cert-manager.io/cluster-issuer: letsencrypt
|
||||
hosts:
|
||||
- atlas.metrics.bstein.dev
|
||||
- metrics.bstein.dev
|
||||
path: /
|
||||
tls:
|
||||
- secretName: grafana-atlas-metrics-tls
|
||||
- secretName: grafana-metrics-tls
|
||||
hosts:
|
||||
- atlas.metrics.bstein.dev
|
||||
- metrics.bstein.dev
|
||||
datasources:
|
||||
datasources.yaml:
|
||||
apiVersion: 1
|
||||
@ -278,25 +278,43 @@ spec:
|
||||
dashboardproviders.yaml:
|
||||
apiVersion: 1
|
||||
providers:
|
||||
- name: public
|
||||
- name: overview
|
||||
orgId: 1
|
||||
folder: Atlas Public
|
||||
folder: Atlas Overview
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: false
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards/public
|
||||
- name: sre
|
||||
path: /var/lib/grafana/dashboards/overview
|
||||
- name: pods
|
||||
orgId: 1
|
||||
folder: Atlas SRE
|
||||
folder: Atlas Pods
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards/sre
|
||||
path: /var/lib/grafana/dashboards/pods
|
||||
- name: nodes
|
||||
orgId: 1
|
||||
folder: Atlas Nodes
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards/nodes
|
||||
- name: storage
|
||||
orgId: 1
|
||||
folder: Atlas Storage
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards/storage
|
||||
dashboardsConfigMaps:
|
||||
public: grafana-dashboard-public
|
||||
sre: grafana-dashboard-sre
|
||||
overview: grafana-dashboard-overview
|
||||
pods: grafana-dashboard-pods
|
||||
nodes: grafana-dashboard-nodes
|
||||
storage: grafana-dashboard-storage
|
||||
extraConfigmapMounts:
|
||||
- name: grafana-folders
|
||||
mountPath: /etc/grafana/provisioning/folders
|
||||
@ -327,14 +345,14 @@ spec:
|
||||
annotations:
|
||||
cert-manager.io/cluster-issuer: letsencrypt
|
||||
hosts:
|
||||
- host: atlas.alerts.bstein.dev
|
||||
- host: alerts.bstein.dev
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
tls:
|
||||
- secretName: alerts-bstein-dev-tls
|
||||
hosts:
|
||||
- atlas.alerts.bstein.dev
|
||||
- alerts.bstein.dev
|
||||
config:
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
|
||||
@ -5,7 +5,9 @@ namespace: monitoring
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- rbac.yaml
|
||||
- grafana-dashboard-public.yaml
|
||||
- grafana-dashboard-sre.yaml
|
||||
- grafana-dashboard-overview.yaml
|
||||
- grafana-dashboard-pods.yaml
|
||||
- grafana-dashboard-nodes.yaml
|
||||
- grafana-dashboard-storage.yaml
|
||||
- grafana-folders.yaml
|
||||
- helmrelease.yaml
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user