monitoring: restructure grafana dashboards

This commit is contained in:
Brad Stein 2025-11-17 14:22:46 -03:00
parent b004bf99dc
commit a41f25e66d
12 changed files with 3847 additions and 530 deletions

605
scripts/render_dashboards.py Executable file
View File

@ -0,0 +1,605 @@
#!/usr/bin/env python3
"""Generate Grafana dashboards and render them into ConfigMaps.
Usage:
python scripts/render_dashboards.py --build # rebuild JSON + ConfigMaps
python scripts/render_dashboards.py # just render ConfigMaps
"""
import argparse
import json
import textwrap
from pathlib import Path
# Repository root: this script lives in scripts/, one level below the root.
ROOT = Path(__file__).resolve().parents[1]
# Directory holding the dashboard JSON files (written by --build, read when
# rendering ConfigMaps).
DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards"
# ConfigMap manifest skeleton filled in by render_configmap(). Placeholders:
# {relative_path}, {name}, {key} and {payload}.
# NOTE(review): YAML nesting inside this literal is whitespace-significant --
# confirm the indentation of the template lines (and of the payload prefix used
# by render_configmap) yields a valid ConfigMap.
CONFIG_TEMPLATE = textwrap.dedent(
"""# {relative_path}
apiVersion: v1
kind: ConfigMap
metadata:
name: {name}
labels:
grafana_dashboard: "1"
data:
{key}: |
{payload}
"""
)
# Datasource reference attached to every Prometheus-backed panel built below.
PROM_DS = {"type": "prometheus", "uid": "atlas-vm"}
# --------------------------------------------------------------------------- #
# Panel helper factories
# --------------------------------------------------------------------------- #
def stat_panel(panel_id, title, expr, grid, *, unit="none", thresholds=None,
               text_mode="value", legend=None):
    """Build a Grafana ``stat`` panel backed by a single Prometheus query.

    Args:
        panel_id: Panel id, unique within the dashboard.
        title: Panel title.
        expr: PromQL expression for the single target (refId ``A``).
        grid: ``gridPos`` dict (``h``, ``w``, ``x``, ``y``).
        unit: Grafana unit id applied to the value.
        thresholds: Optional thresholds dict; when omitted (or falsy) a
            grey-below-1 / green-from-1 absolute scale is used.
        text_mode: Value for the panel's ``textMode`` option.
        legend: Optional ``legendFormat`` for the target.

    Returns:
        dict: The panel definition.
    """
    if not thresholds:
        thresholds = {
            "mode": "absolute",
            "steps": [
                {"color": "rgba(115, 115, 115, 1)", "value": None},
                {"color": "green", "value": 1},
            ],
        }

    defaults = {
        # Thresholds only colour the value when the field colour scheme is
        # "thresholds"; with "palette-classic" the steps above (and the
        # green/yellow/red percent scales callers pass in) were never applied.
        "color": {"mode": "thresholds"},
        "mappings": [],
        "thresholds": thresholds,
        "unit": unit,
    }

    target = {"expr": expr, "refId": "A"}
    if legend:
        target["legendFormat"] = legend

    return {
        "id": panel_id,
        "type": "stat",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [target],
        "fieldConfig": {"defaults": defaults, "overrides": []},
        "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "center",
            "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            "textMode": text_mode,
        },
    }
def timeseries_panel(panel_id, title, expr, grid, *, unit="none", legend=None,
                     legend_display="table", legend_placement="bottom",
                     legend_calcs=None, time_from=None):
    """Build a Grafana ``timeseries`` panel with one Prometheus target.

    Args:
        panel_id: Panel id, unique within the dashboard.
        title: Panel title.
        expr: PromQL expression for the single target (refId ``A``).
        grid: ``gridPos`` dict (``h``, ``w``, ``x``, ``y``).
        unit: Grafana unit id for the series values.
        legend: Optional ``legendFormat`` for the target.
        legend_display: Legend ``displayMode``.
        legend_placement: Legend ``placement``.
        legend_calcs: Optional list of legend ``calcs``; omitted when falsy.
        time_from: Optional panel-level ``timeFrom`` override; omitted when falsy.

    Returns:
        dict: The panel definition.
    """
    query = {"expr": expr, "refId": "A"}
    if legend:
        query["legendFormat"] = legend

    legend_options = {"displayMode": legend_display, "placement": legend_placement}
    if legend_calcs:
        legend_options["calcs"] = legend_calcs

    result = {
        "id": panel_id,
        "type": "timeseries",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
        "options": {"legend": legend_options, "tooltip": {"mode": "multi"}},
    }
    if time_from:
        # Added last so the key lands after "options" in the serialised JSON.
        result["timeFrom"] = time_from
    return result
def table_panel(panel_id, title, expr, grid, *, unit="none", transformations=None,
                description=None):
    """Build a Grafana ``table`` panel with one Prometheus target.

    Args:
        panel_id: Panel id, unique within the dashboard.
        title: Panel title.
        expr: PromQL expression for the single target (refId ``A``).
        grid: ``gridPos`` dict (``h``, ``w``, ``x``, ``y``).
        unit: Grafana unit id for the value column.
        transformations: Optional list of transformation dicts; omitted when falsy.
        description: Optional panel description; omitted when falsy.

    Returns:
        dict: The panel definition.
    """
    table = {
        "id": panel_id,
        "type": "table",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [{"expr": expr, "refId": "A"}],
        "fieldConfig": {"defaults": {"unit": unit}, "overrides": []},
        "options": {"showHeader": True},
    }
    # Optional keys are appended in a fixed order and only when provided.
    optional = (("transformations", transformations), ("description", description))
    for key, value in optional:
        if value:
            table[key] = value
    return table
def pie_panel(panel_id, title, expr, grid):
    """Build a Grafana ``piechart`` panel whose slices are labelled by namespace.

    Args:
        panel_id: Panel id, unique within the dashboard.
        title: Panel title.
        expr: PromQL expression; the legend format is ``{{namespace}}``.
        grid: ``gridPos`` dict (``h``, ``w``, ``x``, ``y``).

    Returns:
        dict: The panel definition.
    """
    query = {"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}
    field_config = {"defaults": {"unit": "percent"}, "overrides": []}
    options = {
        "legend": {"displayMode": "list", "placement": "right"},
        "pieType": "pie",
        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
    }
    return {
        "id": panel_id,
        "type": "piechart",
        "title": title,
        "datasource": PROM_DS,
        "gridPos": grid,
        "targets": [query],
        "fieldConfig": field_config,
        "options": options,
    }
def text_panel(panel_id, title, content, grid):
    """Build a markdown ``text`` panel (no datasource).

    Args:
        panel_id: Panel id, unique within the dashboard.
        title: Panel title.
        content: Markdown body shown in the panel.
        grid: ``gridPos`` dict (``h``, ``w``, ``x``, ``y``).

    Returns:
        dict: The panel definition.
    """
    panel = {"id": panel_id, "type": "text", "title": title}
    panel["gridPos"] = grid
    panel["datasource"] = None
    panel["options"] = {"mode": "markdown", "content": content}
    return panel
def node_cpu_expr(scope=""):
    """Return the PromQL for per-node CPU busy percentage.

    The query turns the 5m idle rate into a busy percentage, derives an
    ``internal_ip`` label from ``instance`` and joins ``kube_node_info`` on it
    to obtain the ``node`` label, then averages by node.

    Args:
        scope: Optional regex of node names; when truthy the result is
            additionally joined against ``kube_node_info{node=~"<scope>"}``.

    Returns:
        str: The PromQL expression.
    """
    base = (
        'avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]), '
        '"internal_ip", "$1", "instance", "([^:]+):.*")) * 100) '
        '* on (internal_ip) group_left(node) kube_node_info))'
    )
    if not scope:
        return base
    return '{} * on(node) group_left() kube_node_info{{node=~"{}"}}'.format(base, scope)
def node_mem_expr(scope=""):
    """Return the PromQL for per-node memory usage percentage.

    Usage is ``(MemTotal - MemAvailable) / MemTotal``; the result is labelled
    with ``internal_ip`` (derived from ``instance``), joined with
    ``kube_node_info`` to obtain ``node`` and averaged by node.

    Args:
        scope: Optional regex of node names; when truthy the result is
            additionally joined against ``kube_node_info{node=~"<scope>"}``.

    Returns:
        str: The PromQL expression.
    """
    base = (
        "avg by (node) (((label_replace("
        "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, "
        '"internal_ip", "$1", "instance", "([^:]+):.*") * 100) '
        "* on (internal_ip) group_left(node) kube_node_info))"
    )
    if not scope:
        return base
    return '{} * on(node) group_left() kube_node_info{{node=~"{}"}}'.format(base, scope)
def root_usage_expr():
    """Return the PromQL for per-node root filesystem usage percentage.

    Computes ``1 - avail/size`` for the ``/`` mountpoint (excluding tmpfs and
    overlay filesystems), derives ``internal_ip`` from ``instance``, joins
    ``kube_node_info`` to obtain ``node`` and averages by node.

    Returns:
        str: The PromQL expression.
    """
    selector = '{mountpoint="/",fstype!~"tmpfs|overlay"}'
    free_ratio = f"node_filesystem_avail_bytes{selector} / node_filesystem_size_bytes{selector}"
    by_ip = f'label_replace({free_ratio}, "internal_ip", "$1", "instance", "([^:]+):.*")'
    return f"avg by (node) (((1 - ({by_ip})) * 100) * on (internal_ip) group_left(node) kube_node_info)"
def astreae_usage_expr(mount):
    """Return the PromQL for aggregate usage percentage of a mountpoint.

    Available and total bytes are each summed over every matching filesystem
    (tmpfs and overlay excluded) before the ratio is taken.

    Args:
        mount: Mountpoint path to select, e.g. ``/mnt/astreae``.

    Returns:
        str: The PromQL expression.
    """
    selector = '{{mountpoint="{}",fstype!~"tmpfs|overlay"}}'.format(mount)
    available = "sum(node_filesystem_avail_bytes" + selector + ")"
    capacity = "sum(node_filesystem_size_bytes" + selector + ")"
    return "100 - (" + available + " / " + capacity + " * 100)"
def astreae_free_expr(mount):
    """Return the PromQL for total available bytes on a mountpoint.

    Args:
        mount: Mountpoint path to select, e.g. ``/mnt/astreae``.

    Returns:
        str: The PromQL expression (sum over matching non-tmpfs/overlay filesystems).
    """
    selector = 'mountpoint="{}",fstype!~"tmpfs|overlay"'.format(mount)
    return "sum(node_filesystem_avail_bytes{" + selector + "})"
def build_overview():
    """Build the "Atlas Overview" dashboard model.

    Layout, top to bottom: a row of eight stat tiles, namespace CPU/RAM pie
    charts, cluster node CPU/RAM graphs, problem-pod tables, control-plane
    CPU/RAM graphs, root filesystem usage, Astreae/Asteria storage stats and
    per-node tables, and a closing markdown note.

    Returns:
        dict: Dashboard definition, serialisable with ``json.dumps``.
    """
    control_plane = "titan-0a|titan-0b|titan-0c"
    labels_to_fields = {"id": "labelsToFields", "options": {}}
    thresholds_percent = {
        "mode": "percentage",
        "steps": [
            {"color": "green", "value": None},
            {"color": "yellow", "value": 70},
            {"color": "red", "value": 85},
        ],
    }
    panels = []

    # --- Row 0: eight stat tiles ------------------------------------------- #
    # Grafana's grid is 24 columns wide. Eight tiles therefore get 3 columns
    # each; the previous 4-column tiles pushed the two "Hottest node" panels
    # to x=24 and x=28, outside the grid.
    tile_width = 3
    stats = [
        (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})'),
        (2, "Ready nodes", 'sum(kube_node_status_condition{condition="Ready",status="true"})'),
        (
            3,
            "Control plane ready",
            f'sum(kube_node_status_condition{{condition="Ready",status="true",node=~"{control_plane}"}})',
        ),
        # "== 0" alone keeps the matching samples with their value (0), so the
        # sum was always 0. "== bool 0" yields 1 per schedulable node instead.
        (
            4,
            "Control plane schedulable",
            f'sum(kube_node_spec_unschedulable{{node=~"{control_plane}"}} == bool 0)',
        ),
        (5, "Problem pods", 'sum(kube_pod_status_phase{phase!~"Running|Succeeded"})'),
        # "> bool 600" counts pods; the plain filter summed their ages in seconds.
        (6, "Stuck terminating", "sum(((time() - kube_pod_deletion_timestamp) > bool 600))"),
    ]
    for idx, (panel_id, title, expr) in enumerate(stats):
        panels.append(
            stat_panel(
                panel_id,
                title,
                expr,
                {"h": 5, "w": tile_width, "x": tile_width * idx, "y": 0},
            )
        )

    # topk(1, ...) keeps only the busiest node so each tile shows a single
    # value; the bare per-node expression produced one stat per node.
    hottest = [
        (7, "Hottest node: CPU", node_cpu_expr()),
        (8, "Hottest node: RAM", node_mem_expr()),
    ]
    for offset, (panel_id, title, expr) in enumerate(hottest, start=len(stats)):
        panels.append(
            stat_panel(
                panel_id,
                title,
                f"topk(1, {expr})",
                {"h": 5, "w": tile_width, "x": tile_width * offset, "y": 0},
                unit="percent",
                thresholds=thresholds_percent,
                text_mode="value_and_name",
                legend="{{node}}",
            )
        )

    # --- Namespace share pies ---------------------------------------------- #
    panels.append(
        pie_panel(
            9,
            "Namespace CPU share",
            'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace))',
            {"h": 9, "w": 12, "x": 0, "y": 5},
        )
    )
    panels.append(
        pie_panel(
            10,
            "Namespace RAM share",
            'topk(10, sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace))',
            {"h": 9, "w": 12, "x": 12, "y": 5},
        )
    )

    # --- Cluster-wide node CPU / RAM --------------------------------------- #
    panels.append(
        timeseries_panel(
            11,
            "Cluster node CPU",
            node_cpu_expr(),
            {"h": 8, "w": 12, "x": 0, "y": 14},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )
    panels.append(
        timeseries_panel(
            12,
            "Cluster node RAM",
            node_mem_expr(),
            {"h": 8, "w": 12, "x": 12, "y": 14},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
        )
    )

    # --- Problem pod tables ------------------------------------------------ #
    # kube_pod_status_phase exposes one 0/1 series per phase for every pod, so
    # the right-hand side must be narrowed to the active phase (== 1);
    # otherwise each pod has several candidate series for group_left and pods
    # that are not in a problem phase are not excluded.
    panels.append(
        table_panel(
            13,
            "Problem pods (details)",
            "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info"
            " * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase)"
            ' (kube_pod_status_phase{phase!~"Running|Succeeded"} == 1)',
            {"h": 8, "w": 12, "x": 0, "y": 22},
            unit="s",
            transformations=[labels_to_fields],
        )
    )
    # The 10-minute cut-off is applied in PromQL: the former filterByValue
    # transformation used option keys ("match"/"operator"/"value") that do not
    # follow Grafana's filterByValue schema, so it could not be relied on.
    panels.append(
        table_panel(
            14,
            "Terminating >10m",
            "((time() - kube_pod_deletion_timestamp) > 600) * on(namespace,pod) group_left(node) kube_pod_info",
            {"h": 8, "w": 12, "x": 12, "y": 22},
            unit="s",
            transformations=[labels_to_fields],
        )
    )

    # --- Control plane CPU / RAM ------------------------------------------- #
    panels.append(
        timeseries_panel(
            15,
            "Control plane CPU",
            node_cpu_expr(control_plane),
            {"h": 7, "w": 12, "x": 0, "y": 30},
            unit="percent",
            legend="{{node}}",
        )
    )
    panels.append(
        timeseries_panel(
            16,
            "Control plane RAM",
            node_mem_expr(control_plane),
            {"h": 7, "w": 12, "x": 12, "y": 30},
            unit="percent",
            legend="{{node}}",
        )
    )

    # --- Root filesystem --------------------------------------------------- #
    panels.append(
        timeseries_panel(
            17,
            "Root filesystem usage",
            root_usage_expr(),
            {"h": 8, "w": 12, "x": 0, "y": 37},
            unit="percent",
            legend="{{node}}",
            legend_calcs=["last"],
            legend_display="table",
            legend_placement="right",
            time_from="7d",
        )
    )
    # No helper exists for bar gauges, so this panel is spelled out inline.
    panels.append(
        {
            "id": 18,
            "type": "bargauge",
            "title": "Nodes closest to full root disks",
            "datasource": PROM_DS,
            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 37},
            "targets": [
                {"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}
            ],
            "fieldConfig": {
                "defaults": {
                    "unit": "percent",
                    "min": 0,
                    "max": 100,
                    "thresholds": {
                        "mode": "percentage",
                        "steps": [
                            {"color": "green", "value": None},
                            {"color": "yellow", "value": 50},
                            {"color": "orange", "value": 70},
                            {"color": "red", "value": 85},
                        ],
                    },
                },
                "overrides": [],
            },
            "options": {
                "displayMode": "gradient",
                "orientation": "horizontal",
                "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False},
            },
        }
    )

    # --- Astreae / Asteria storage ----------------------------------------- #
    panels.append(
        stat_panel(
            19,
            "Astreae usage",
            astreae_usage_expr("/mnt/astreae"),
            {"h": 6, "w": 6, "x": 0, "y": 45},
            unit="percent",
            thresholds=thresholds_percent,
        )
    )
    panels.append(
        stat_panel(
            20,
            "Asteria usage",
            astreae_usage_expr("/mnt/asteria"),
            {"h": 6, "w": 6, "x": 6, "y": 45},
            unit="percent",
            thresholds=thresholds_percent,
        )
    )
    panels.append(
        stat_panel(
            21,
            "Astreae free",
            astreae_free_expr("/mnt/astreae"),
            {"h": 6, "w": 6, "x": 12, "y": 45},
            unit="bytesSI",
        )
    )
    panels.append(
        stat_panel(
            22,
            "Asteria free",
            astreae_free_expr("/mnt/asteria"),
            {"h": 6, "w": 6, "x": 18, "y": 45},
            unit="bytesSI",
        )
    )
    panels.append(
        table_panel(
            23,
            "Astreae per-node usage",
            '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} * 100)',
            {"h": 8, "w": 12, "x": 0, "y": 51},
            unit="percent",
            transformations=[labels_to_fields],
        )
    )
    panels.append(
        table_panel(
            24,
            "Asteria per-node usage",
            '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} * 100)',
            {"h": 8, "w": 12, "x": 12, "y": 51},
            unit="percent",
            transformations=[labels_to_fields],
        )
    )

    # --- Footer note ------------------------------------------------------- #
    panels.append(
        text_panel(
            25,
            "About this dashboard",
            "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders",
            {"h": 5, "w": 24, "x": 0, "y": 59},
        )
    )

    return {
        "uid": "atlas-overview",
        "title": "Atlas Overview",
        "annotations": {
            "list": [
                {
                    "builtIn": 1,
                    "datasource": {"type": "datasource", "uid": "grafana"},
                    "enable": True,
                    "hide": True,
                    "iconColor": "rgba(0, 211, 255, 1)",
                    "name": "Annotations & Alerts",
                    "type": "dashboard",
                }
            ]
        },
        "editable": False,
        "folderUid": "atlas-overview",
        "graphTooltip": 0,
        "links": [
            {"title": "Pods dashboard", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False},
            {"title": "Nodes dashboard", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False},
            {"title": "Storage dashboard", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False},
        ],
        "panels": panels,
        "refresh": "30s",
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "overview"],
        "templating": {"list": []},
        "time": {"from": "now-12h", "to": "now"},
    }
def build_pods_dashboard():
    """Build the "Atlas Pods" dashboard model.

    Three full-width tables, each valued with an age in seconds and joined with
    ``kube_pod_info`` for the ``node`` label: pods in a non-running phase, pods
    with containers waiting in CrashLoopBackOff/ImagePullBackOff, and pods that
    carry a deletion timestamp.

    Returns:
        dict: Dashboard definition, serialisable with ``json.dumps``.
    """
    pod_age = "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info"
    labels_to_fields = {"id": "labelsToFields", "options": {}}
    panels = []

    # kube_pod_status_phase exposes one 0/1 series per phase for every pod.
    # Keeping only the active phase (== 1) leaves a single right-hand series
    # per pod for group_left and drops pods that are not in a problem phase.
    panels.append(
        table_panel(
            1,
            "Pods not running",
            f"{pod_age} * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase)"
            ' (kube_pod_status_phase{phase!~"Running|Succeeded"} == 1)',
            {"h": 10, "w": 24, "x": 0, "y": 0},
            unit="s",
            transformations=[labels_to_fields],
        )
    )

    # The right-hand side is aggregated to (namespace,pod,reason), so it has no
    # "container" label left to match on; match on (namespace,pod) only.
    # "max by" over active (> 0) series keeps the multiplier at 1, whereas
    # "sum by" scaled the pod age by the number of waiting containers.
    panels.append(
        table_panel(
            2,
            "CrashLoop / ImagePull",
            f"{pod_age} * on(namespace,pod) group_left(reason) max by (namespace,pod,reason)"
            ' (kube_pod_container_status_waiting_reason{reason=~"CrashLoopBackOff|ImagePullBackOff"} > 0)',
            {"h": 10, "w": 24, "x": 0, "y": 10},
            unit="s",
            transformations=[labels_to_fields],
        )
    )

    panels.append(
        table_panel(
            3,
            "Terminating pods",
            "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
            {"h": 10, "w": 24, "x": 0, "y": 20},
            unit="s",
            transformations=[
                labels_to_fields,
                # NOTE(review): these option keys do not look like Grafana's
                # filterByValue schema (filters/type/match) -- confirm the
                # 600s cut-off is actually applied.
                {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}},
            ],
        )
    )

    return {
        "uid": "atlas-pods",
        "title": "Atlas Pods",
        "folderUid": "atlas-pods",
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "pods"],
    }
def build_nodes_dashboard():
    """Build the "Atlas Nodes" dashboard model.

    A row of four stat tiles (node count, ready nodes, control-plane CPU and
    RAM) followed by full-width graphs for node CPU, node RAM and root
    filesystem usage.

    Returns:
        dict: Dashboard definition, serialisable with ``json.dumps``.
    """
    control_plane = "titan-0a|titan-0b|titan-0c"
    # Keyword sets shared by several panels below.
    named_percent = {"unit": "percent", "legend": "{{node}}", "text_mode": "value_and_name"}
    side_table = {
        "unit": "percent",
        "legend": "{{node}}",
        "legend_display": "table",
        "legend_placement": "right",
    }

    def tile(column):
        # Stat tiles are 6 columns wide on the first row.
        return {"h": 5, "w": 6, "x": 6 * column, "y": 0}

    def full_width(row):
        return {"h": 9, "w": 24, "x": 0, "y": row}

    panels = [
        stat_panel(1, "Node count", "count(kube_node_info)", tile(0)),
        stat_panel(
            2,
            "Ready nodes",
            'sum(kube_node_status_condition{condition="Ready",status="true"})',
            tile(1),
        ),
        stat_panel(3, "Control plane CPU avg", node_cpu_expr(control_plane), tile(2), **named_percent),
        stat_panel(4, "Control plane RAM avg", node_mem_expr(control_plane), tile(3), **named_percent),
        timeseries_panel(5, "Node CPU", node_cpu_expr(), full_width(5), legend_calcs=["last"], **side_table),
        timeseries_panel(6, "Node RAM", node_mem_expr(), full_width(14), legend_calcs=["last"], **side_table),
        timeseries_panel(7, "Root filesystem", root_usage_expr(), full_width(23), time_from="7d", **side_table),
    ]

    dashboard = {
        "uid": "atlas-nodes",
        "title": "Atlas Nodes",
        "folderUid": "atlas-nodes",
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "nodes"],
    }
    return dashboard
def build_storage_dashboard():
    """Build the "Atlas Storage" dashboard model.

    Usage and free-space stat tiles for the Astreae and Asteria mounts, a
    root filesystem graph with a 30-day window, and one per-node usage table
    for each mount.

    Returns:
        dict: Dashboard definition, serialisable with ``json.dumps``.
    """
    volumes = (("Astreae", "/mnt/astreae"), ("Asteria", "/mnt/asteria"))

    def per_node_usage(mount):
        # Same ratio as astreae_usage_expr but without the sum(), so one row per series.
        selector = f'{{mountpoint="{mount}",fstype!~"tmpfs|overlay"}}'
        return (
            f"100 - (node_filesystem_avail_bytes{selector} / "
            f"node_filesystem_size_bytes{selector} * 100)"
        )

    panels = []
    for column, (label, mount) in enumerate(volumes):
        panels.append(
            stat_panel(
                1 + column,
                f"{label} usage",
                astreae_usage_expr(mount),
                {"h": 5, "w": 6, "x": 6 * column, "y": 0},
                unit="percent",
            )
        )
    for column, (label, mount) in enumerate(volumes):
        panels.append(
            stat_panel(
                3 + column,
                f"{label} free",
                astreae_free_expr(mount),
                {"h": 5, "w": 6, "x": 12 + 6 * column, "y": 0},
                unit="bytesSI",
            )
        )
    panels.append(
        timeseries_panel(
            5,
            "Root filesystem",
            root_usage_expr(),
            {"h": 9, "w": 24, "x": 0, "y": 5},
            unit="percent",
            legend="{{node}}",
            legend_display="table",
            legend_placement="right",
            time_from="30d",
        )
    )
    for column, (label, mount) in enumerate(volumes):
        panels.append(
            table_panel(
                6 + column,
                f"{label} nodes",
                per_node_usage(mount),
                {"h": 10, "w": 12, "x": 12 * column, "y": 14},
                unit="percent",
                transformations=[{"id": "labelsToFields", "options": {}}],
            )
        )

    return {
        "uid": "atlas-storage",
        "title": "Atlas Storage",
        "folderUid": "atlas-storage",
        "editable": True,
        "panels": panels,
        "time": {"from": "now-12h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "storage"],
    }
# Registry of dashboards keyed by uid. Each entry names the builder that
# produces the dashboard model and the ConfigMap manifest it is rendered into.
DASHBOARDS = {
    f"atlas-{slug}": {
        "builder": builder,
        "configmap": ROOT / "services" / "monitoring" / f"grafana-dashboard-{slug}.yaml",
    }
    for slug, builder in (
        ("overview", build_overview),
        ("pods", build_pods_dashboard),
        ("nodes", build_nodes_dashboard),
        ("storage", build_storage_dashboard),
    )
}
def write_json(uid: str, data: dict) -> None:
    """Write a dashboard model to ``DASHBOARD_DIR/<uid>.json``.

    The directory is created if missing; the JSON is indented by two spaces
    and terminated with a newline.

    Args:
        uid: Dashboard uid, used as the file stem.
        data: Dashboard model to serialise.
    """
    DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(data, indent=2)
    target = DASHBOARD_DIR / f"{uid}.json"
    target.write_text(f"{serialized}\n")
def render_configmap(uid: str, data: dict) -> None:
    """Render ``DASHBOARD_DIR/<uid>.json`` into its ConfigMap manifest.

    The JSON file is parsed and re-serialised with a two-space indent, each
    line is prefixed, and the result is substituted into ``CONFIG_TEMPLATE``
    and written to ``data["configmap"]``. A one-line summary is printed.

    Args:
        uid: Dashboard uid; selects the JSON file to read.
        data: Registry entry providing the ``"configmap"`` output path.
    """
    source = DASHBOARD_DIR / f"{uid}.json"
    # Round-trip through the parser so the embedded JSON is normalised.
    dashboard = json.loads(source.read_text())
    # NOTE(review): this prefix must indent the payload deeper than the
    # "{key}: |" line in CONFIG_TEMPLATE -- confirm the width against the template.
    prefixed = [" " + line for line in json.dumps(dashboard, indent=2).splitlines()]
    target = data["configmap"]
    manifest = CONFIG_TEMPLATE.format(
        relative_path=target.relative_to(ROOT),
        name=target.stem,
        key=source.name,
        payload="\n".join(prefixed),
    )
    target.write_text(manifest)
    print(f"Rendered {source.name} -> {target.relative_to(ROOT)}")
def main():
    """Command-line entry point.

    With ``--build`` every registered builder is run and its JSON written
    first; in all cases every registered dashboard JSON is then rendered into
    its ConfigMap manifest.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--build",
        action="store_true",
        help="Regenerate dashboard JSON files from builders",
    )
    options = cli.parse_args()

    if options.build:
        for uid, entry in DASHBOARDS.items():
            model = entry["builder"]()
            write_json(uid, model)

    for uid, entry in DASHBOARDS.items():
        render_configmap(uid, entry)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,369 @@
{
"uid": "atlas-nodes",
"title": "Atlas Nodes",
"folderUid": "atlas-nodes",
"editable": true,
"panels": [
{
"id": 1,
"type": "stat",
"title": "Node count",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "count(kube_node_info)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "none"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 2,
"type": "stat",
"title": "Ready nodes",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 6,
"x": 6,
"y": 0
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "none"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 3,
"type": "stat",
"title": "Control plane CPU avg",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "percent"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value_and_name"
}
},
{
"id": 4,
"type": "stat",
"title": "Control plane RAM avg",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "percent"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value_and_name"
}
},
{
"id": 5,
"type": "timeseries",
"title": "Node CPU",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 5
},
"targets": [
{
"expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": [
"last"
]
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 6,
"type": "timeseries",
"title": "Node RAM",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 14
},
"targets": [
{
"expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right",
"calcs": [
"last"
]
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 7,
"type": "timeseries",
"title": "Root filesystem",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 23
},
"targets": [
{
"expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "7d"
}
],
"time": {
"from": "now-12h",
"to": "now"
},
"annotations": {
"list": []
},
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"nodes"
]
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,137 @@
{
"uid": "atlas-pods",
"title": "Atlas Pods",
"folderUid": "atlas-pods",
"editable": true,
"panels": [
{
"id": 1,
"type": "table",
"title": "Pods not running",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 24,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
},
{
"id": 2,
"type": "table",
"title": "CrashLoop / ImagePull",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 24,
"x": 0,
"y": 10
},
"targets": [
{
"expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
},
{
"id": 3,
"type": "table",
"title": "Terminating pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 24,
"x": 0,
"y": 20
},
"targets": [
{
"expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
},
{
"id": "filterByValue",
"options": {
"match": "Value",
"operator": "gt",
"value": 600
}
}
]
}
],
"time": {
"from": "now-12h",
"to": "now"
},
"annotations": {
"list": []
},
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"pods"
]
}

View File

@ -0,0 +1,359 @@
{
"uid": "atlas-storage",
"title": "Atlas Storage",
"folderUid": "atlas-storage",
"editable": true,
"panels": [
{
"id": 1,
"type": "stat",
"title": "Astreae usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "percent"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 2,
"type": "stat",
"title": "Asteria usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 6,
"x": 6,
"y": 0
},
"targets": [
{
"expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "percent"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 3,
"type": "stat",
"title": "Astreae free",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "bytesSI"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Asteria free",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "bytesSI"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "timeseries",
"title": "Root filesystem",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 5
},
"targets": [
{
"expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "30d"
},
{
"id": 6,
"type": "table",
"title": "Astreae nodes",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 14
},
"targets": [
{
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
},
{
"id": 7,
"type": "table",
"title": "Asteria nodes",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 14
},
"targets": [
{
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
}
],
"time": {
"from": "now-12h",
"to": "now"
},
"annotations": {
"list": []
},
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"storage"
]
}

View File

@ -1,38 +1,22 @@
# services/monitoring/grafana-dashboard-sre.yaml
# services/monitoring/grafana-dashboard-nodes.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-sre
name: grafana-dashboard-nodes
labels:
grafana_dashboard: "1"
data:
atlas-sre-overview.json: |
atlas-nodes.json: |
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"uid": "atlas-nodes",
"title": "Atlas Nodes",
"folderUid": "atlas-nodes",
"editable": true,
"folderUid": "atlas-sre",
"graphTooltip": 0,
"links": [],
"panels": [
{
"id": 1,
"type": "stat",
"title": "Ready nodes",
"title": "Node count",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -45,7 +29,7 @@ data:
},
"targets": [
{
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / count(kube_node_info) * 100",
"expr": "count(kube_node_info)",
"refId": "A"
}
],
@ -56,23 +40,19 @@ data:
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"mode": "absolute",
"steps": [
{
"color": "red",
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "yellow",
"value": 95
},
{
"color": "green",
"value": 99
"value": 1
}
]
},
"unit": "percent"
"unit": "none"
},
"overrides": []
},
@ -93,7 +73,7 @@ data:
{
"id": 2,
"type": "stat",
"title": "Pending pods",
"title": "Ready nodes",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -106,7 +86,7 @@ data:
},
"targets": [
{
"expr": "sum(kube_pod_status_phase{phase=\"Pending\"})",
"expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
"refId": "A"
}
],
@ -120,16 +100,12 @@ data:
"mode": "absolute",
"steps": [
{
"color": "green",
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "yellow",
"value": 3
},
{
"color": "red",
"value": 10
"color": "green",
"value": 1
}
]
},
@ -154,7 +130,7 @@ data:
{
"id": 3,
"type": "stat",
"title": "Unavailable deployment replicas",
"title": "Control plane CPU avg",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -167,8 +143,9 @@ data:
},
"targets": [
{
"expr": "sum(kube_deployment_status_replicas_unavailable)",
"refId": "A"
"expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
@ -181,20 +158,16 @@ data:
"mode": "absolute",
"steps": [
{
"color": "green",
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "yellow",
"color": "green",
"value": 1
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none"
"unit": "percent"
},
"overrides": []
},
@ -209,13 +182,13 @@ data:
"fields": "",
"values": false
},
"textMode": "value"
"textMode": "value_and_name"
}
},
{
"id": 4,
"type": "stat",
"title": "Active alerts",
"title": "Control plane RAM avg",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
@ -228,8 +201,9 @@ data:
},
"targets": [
{
"expr": "sum(ALERTS{alertstate=\"firing\"})",
"refId": "A"
"expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
@ -242,20 +216,16 @@ data:
"mode": "absolute",
"steps": [
{
"color": "green",
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "yellow",
"color": "green",
"value": 1
},
{
"color": "red",
"value": 3
}
]
},
"unit": "none"
"unit": "percent"
},
"overrides": []
},
@ -270,20 +240,20 @@ data:
"fields": "",
"values": false
},
"textMode": "value"
"textMode": "value_and_name"
}
},
{
"id": 5,
"type": "timeseries",
"title": "Node CPU usage",
"title": "Node CPU",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 12,
"w": 24,
"x": 0,
"y": 5
},
@ -303,7 +273,10 @@ data:
"options": {
"legend": {
"displayMode": "table",
"placement": "bottom"
"placement": "right",
"calcs": [
"last"
]
},
"tooltip": {
"mode": "multi"
@ -313,16 +286,16 @@ data:
{
"id": 6,
"type": "timeseries",
"title": "Node memory usage",
"title": "Node RAM",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 5
"w": 24,
"x": 0,
"y": 14
},
"targets": [
{
@ -340,7 +313,10 @@ data:
"options": {
"legend": {
"displayMode": "table",
"placement": "bottom"
"placement": "right",
"calcs": [
"last"
]
},
"tooltip": {
"mode": "multi"
@ -350,201 +326,22 @@ data:
{
"id": 7,
"type": "timeseries",
"title": "Top pod CPU (5m avg)",
"title": "Root filesystem",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 14
},
"targets": [
{
"expr": "topk(5, sum(rate(container_cpu_usage_seconds_total{pod!=\"\",container!=\"\"}[5m])) by (namespace,pod))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pod}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "cores"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 8,
"type": "timeseries",
"title": "Top pod memory working set",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 14
},
"targets": [
{
"expr": "topk(5, sum(container_memory_working_set_bytes{pod!=\"\",container!=\"\"}) by (namespace,pod))",
"refId": "A",
"legendFormat": "{{namespace}}/{{pod}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "bytes"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 9,
"type": "bargauge",
"title": "Namespace restart rate (6h)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"w": 24,
"x": 0,
"y": 23
},
"targets": [
{
"expr": "topk(8, sum(increase(kube_pod_container_status_restarts_total{namespace!=\"\"}[6h])) by (namespace))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "none"
},
"overrides": []
},
"options": {
"displayMode": "gradient",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
}
}
},
{
"id": 10,
"type": "table",
"title": "Deployments missing replicas",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 23
},
"targets": [
{
"expr": "topk(10, sum by (namespace,deployment) (kube_deployment_status_replicas_unavailable))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "none"
},
"overrides": []
},
"options": {
"showHeader": true
}
},
{
"id": 11,
"type": "timeseries",
"title": "Pod phase breakdown",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 31
},
"targets": [
{
"expr": "sum(kube_pod_status_phase) by (phase)",
"expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
"refId": "A",
"legendFormat": "{{phase}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "none"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 12,
"type": "timeseries",
"title": "PVC usage (top 8)",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 31
},
"targets": [
{
"expr": "topk(8, sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100))",
"refId": "A",
"legendFormat": "{{namespace}}/{{persistentvolumeclaim}}"
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
@ -556,28 +353,26 @@ data:
"options": {
"legend": {
"displayMode": "table",
"placement": "bottom"
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
"timeFrom": "7d"
}
],
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"sre"
],
"templating": {
"list": []
},
"time": {
"from": "now-12h",
"to": "now"
},
"title": "Atlas SRE Overview",
"uid": "atlas-sre",
"version": 4
"annotations": {
"list": []
},
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"nodes"
]
}

View File

@ -0,0 +1,146 @@
# services/monitoring/grafana-dashboard-pods.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-pods
labels:
grafana_dashboard: "1"
data:
atlas-pods.json: |
{
"uid": "atlas-pods",
"title": "Atlas Pods",
"folderUid": "atlas-pods",
"editable": true,
"panels": [
{
"id": 1,
"type": "table",
"title": "Pods not running",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 24,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
},
{
"id": 2,
"type": "table",
"title": "CrashLoop / ImagePull",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 24,
"x": 0,
"y": 10
},
"targets": [
{
"expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
},
{
"id": 3,
"type": "table",
"title": "Terminating pods",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 24,
"x": 0,
"y": 20
},
"targets": [
{
"expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
},
{
"id": "filterByValue",
"options": {
"match": "Value",
"operator": "gt",
"value": 600
}
}
]
}
],
"time": {
"from": "now-12h",
"to": "now"
},
"annotations": {
"list": []
},
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"pods"
]
}

View File

@ -0,0 +1,368 @@
# services/monitoring/grafana-dashboard-storage.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-storage
labels:
grafana_dashboard: "1"
data:
atlas-storage.json: |
{
"uid": "atlas-storage",
"title": "Atlas Storage",
"folderUid": "atlas-storage",
"editable": true,
"panels": [
{
"id": 1,
"type": "stat",
"title": "Astreae usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "percent"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 2,
"type": "stat",
"title": "Asteria usage",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 6,
"x": 6,
"y": 0
},
"targets": [
{
"expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "percent"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 3,
"type": "stat",
"title": "Astreae free",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "bytesSI"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Asteria free",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 5,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "bytesSI"
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "timeseries",
"title": "Root filesystem",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 5
},
"targets": [
{
"expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)",
"refId": "A",
"legendFormat": "{{node}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
},
"timeFrom": "30d"
},
{
"id": 6,
"type": "table",
"title": "Astreae nodes",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 0,
"y": 14
},
"targets": [
{
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
},
{
"id": 7,
"type": "table",
"title": "Asteria nodes",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 10,
"w": 12,
"x": 12,
"y": 14
},
"targets": [
{
"expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent"
},
"overrides": []
},
"options": {
"showHeader": true
},
"transformations": [
{
"id": "labelsToFields",
"options": {}
}
]
}
],
"time": {
"from": "now-12h",
"to": "now"
},
"annotations": {
"list": []
},
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"storage"
]
}

View File

@ -10,8 +10,8 @@ data:
folders.yaml: |
apiVersion: 1
folders:
- uid: atlas-public
title: Atlas Public
- uid: atlas-overview
title: Atlas Overview
permissions:
- role: Viewer
permission: View
@ -19,8 +19,22 @@ data:
permission: Edit
- role: Admin
permission: Admin
- uid: atlas-sre
title: Atlas SRE
- uid: atlas-pods
title: Atlas Pods
permissions:
- role: Editor
permission: View
- role: Admin
permission: Admin
- uid: atlas-nodes
title: Atlas Nodes
permissions:
- role: Editor
permission: View
- role: Admin
permission: Admin
- uid: atlas-storage
title: Atlas Storage
permissions:
- role: Editor
permission: View

View File

@ -244,8 +244,8 @@ spec:
GF_SECURITY_ALLOW_EMBEDDING: "true"
grafana.ini:
server:
domain: atlas.metrics.bstein.dev
root_url: https://atlas.metrics.bstein.dev/
domain: metrics.bstein.dev
root_url: https://metrics.bstein.dev/
auth.anonymous:
hide_version: true
users:
@ -256,12 +256,12 @@ spec:
annotations:
cert-manager.io/cluster-issuer: letsencrypt
hosts:
- atlas.metrics.bstein.dev
- metrics.bstein.dev
path: /
tls:
- secretName: grafana-atlas-metrics-tls
- secretName: grafana-metrics-tls
hosts:
- atlas.metrics.bstein.dev
- metrics.bstein.dev
datasources:
datasources.yaml:
apiVersion: 1
@ -278,25 +278,43 @@ spec:
dashboardproviders.yaml:
apiVersion: 1
providers:
- name: public
- name: overview
orgId: 1
folder: Atlas Public
folder: Atlas Overview
type: file
disableDeletion: false
editable: false
options:
path: /var/lib/grafana/dashboards/public
- name: sre
path: /var/lib/grafana/dashboards/overview
- name: pods
orgId: 1
folder: Atlas SRE
folder: Atlas Pods
type: file
disableDeletion: false
editable: true
options:
path: /var/lib/grafana/dashboards/sre
path: /var/lib/grafana/dashboards/pods
- name: nodes
orgId: 1
folder: Atlas Nodes
type: file
disableDeletion: false
editable: true
options:
path: /var/lib/grafana/dashboards/nodes
- name: storage
orgId: 1
folder: Atlas Storage
type: file
disableDeletion: false
editable: true
options:
path: /var/lib/grafana/dashboards/storage
dashboardsConfigMaps:
public: grafana-dashboard-public
sre: grafana-dashboard-sre
overview: grafana-dashboard-overview
pods: grafana-dashboard-pods
nodes: grafana-dashboard-nodes
storage: grafana-dashboard-storage
extraConfigmapMounts:
- name: grafana-folders
mountPath: /etc/grafana/provisioning/folders
@ -327,14 +345,14 @@ spec:
annotations:
cert-manager.io/cluster-issuer: letsencrypt
hosts:
- host: atlas.alerts.bstein.dev
- host: alerts.bstein.dev
paths:
- path: /
pathType: Prefix
tls:
- secretName: alerts-bstein-dev-tls
hosts:
- atlas.alerts.bstein.dev
- alerts.bstein.dev
config:
global:
resolve_timeout: 5m

View File

@ -5,7 +5,9 @@ namespace: monitoring
resources:
- namespace.yaml
- rbac.yaml
- grafana-dashboard-public.yaml
- grafana-dashboard-sre.yaml
- grafana-dashboard-overview.yaml
- grafana-dashboard-pods.yaml
- grafana-dashboard-nodes.yaml
- grafana-dashboard-storage.yaml
- grafana-folders.yaml
- helmrelease.yaml