From a41f25e66d7bbc02ea3fb287920f4eb4bfda686d Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Mon, 17 Nov 2025 14:22:46 -0300 Subject: [PATCH] monitoring: restructure grafana dashboards --- scripts/render_dashboards.py | 605 ++++++++ .../monitoring/dashboards/atlas-nodes.json | 369 +++++ .../monitoring/dashboards/atlas-overview.json | 1270 +++++++++++++++++ .../monitoring/dashboards/atlas-pods.json | 137 ++ .../monitoring/dashboards/atlas-storage.json | 359 +++++ ...-sre.yaml => grafana-dashboard-nodes.yaml} | 331 +---- ...c.yaml => grafana-dashboard-overview.yaml} | 716 ++++++---- .../monitoring/grafana-dashboard-pods.yaml | 146 ++ .../monitoring/grafana-dashboard-storage.yaml | 368 +++++ services/monitoring/grafana-folders.yaml | 22 +- services/monitoring/helmrelease.yaml | 48 +- services/monitoring/kustomization.yaml | 6 +- 12 files changed, 3847 insertions(+), 530 deletions(-) create mode 100755 scripts/render_dashboards.py create mode 100644 services/monitoring/dashboards/atlas-nodes.json create mode 100644 services/monitoring/dashboards/atlas-overview.json create mode 100644 services/monitoring/dashboards/atlas-pods.json create mode 100644 services/monitoring/dashboards/atlas-storage.json rename services/monitoring/{grafana-dashboard-sre.yaml => grafana-dashboard-nodes.yaml} (53%) rename services/monitoring/{grafana-dashboard-public.yaml => grafana-dashboard-overview.yaml} (67%) create mode 100644 services/monitoring/grafana-dashboard-pods.yaml create mode 100644 services/monitoring/grafana-dashboard-storage.yaml diff --git a/scripts/render_dashboards.py b/scripts/render_dashboards.py new file mode 100755 index 0000000..fa9ef58 --- /dev/null +++ b/scripts/render_dashboards.py @@ -0,0 +1,605 @@ +#!/usr/bin/env python3 +"""Generate Grafana dashboards and render them into ConfigMaps. + +Usage: + python scripts/render_dashboards.py --build # rebuild JSON + ConfigMaps + python scripts/render_dashboards.py # just render ConfigMaps +""" +import argparse +import json +import textwrap +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +DASHBOARD_DIR = ROOT / "services" / "monitoring" / "dashboards" +CONFIG_TEMPLATE = textwrap.dedent( + """# {relative_path} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {name} + labels: + grafana_dashboard: "1" +data: + {key}: | +{payload} +""" +) + +PROM_DS = {"type": "prometheus", "uid": "atlas-vm"} + + +# --------------------------------------------------------------------------- # +# Panel helper factories +# --------------------------------------------------------------------------- # + + +def stat_panel(panel_id, title, expr, grid, *, unit="none", thresholds=None, + text_mode="value", legend=None): + defaults = { + "color": {"mode": "palette-classic"}, + "mappings": [], + "thresholds": thresholds + or { + "mode": "absolute", + "steps": [ + {"color": "rgba(115, 115, 115, 1)", "value": None}, + {"color": "green", "value": 1}, + ], + }, + "unit": unit, + } + panel = { + "id": panel_id, + "type": "stat", + "title": title, + "datasource": PROM_DS, + "gridPos": grid, + "targets": [{"expr": expr, "refId": "A"}], + "fieldConfig": {"defaults": defaults, "overrides": []}, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, + "textMode": text_mode, + }, + } + if legend: + panel["targets"][0]["legendFormat"] = legend + return panel + + +def timeseries_panel(panel_id, title, expr, grid, *, unit="none", legend=None, + legend_display="table", legend_placement="bottom", + legend_calcs=None, time_from=None): + panel = { + "id": panel_id, + "type": "timeseries", + "title": title, + "datasource": PROM_DS, + "gridPos": grid, + "targets": [{"expr": expr, "refId": "A"}], + "fieldConfig": {"defaults": {"unit": unit}, "overrides": []}, + "options": { + "legend": { + "displayMode": legend_display, + "placement": legend_placement, + }, + "tooltip": {"mode": "multi"}, + }, + } + if legend: + panel["targets"][0]["legendFormat"] = legend + if legend_calcs: + panel["options"]["legend"]["calcs"] = legend_calcs + if time_from: + panel["timeFrom"] = time_from + return panel + + +def table_panel(panel_id, title, expr, grid, *, unit="none", transformations=None, + description=None): + panel = { + "id": panel_id, + "type": "table", + "title": title, + "datasource": PROM_DS, + "gridPos": grid, + "targets": [{"expr": expr, "refId": "A"}], + "fieldConfig": {"defaults": {"unit": unit}, "overrides": []}, + "options": {"showHeader": True}, + } + if transformations: + panel["transformations"] = transformations + if description: + panel["description"] = description + return panel + + +def pie_panel(panel_id, title, expr, grid): + return { + "id": panel_id, + "type": "piechart", + "title": title, + "datasource": PROM_DS, + "gridPos": grid, + "targets": [{"expr": expr, "refId": "A", "legendFormat": "{{namespace}}"}], + "fieldConfig": {"defaults": {"unit": "percent"}, "overrides": []}, + "options": { + "legend": {"displayMode": "list", "placement": "right"}, + "pieType": "pie", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, + }, + } + + +def text_panel(panel_id, title, content, grid): + return { + "id": panel_id, + "type": "text", + "title": title, + "gridPos": grid, + "datasource": None, + "options": {"mode": "markdown", "content": content}, + } + + +def node_cpu_expr(scope=""): + expr = "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))" + if scope: + expr = f"{expr} * on(node) group_left() kube_node_info{{node=~\"{scope}\"}}" + return expr + + +def node_mem_expr(scope=""): + expr = "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))" + if scope: + expr = f"{expr} * on(node) group_left() kube_node_info{{node=~\"{scope}\"}}" + return expr + + +def root_usage_expr(): + return "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)" + + +def astreae_usage_expr(mount): + return ( + f"100 - (sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) / " + f"sum(node_filesystem_size_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}}) * 100)" + ) + + +def astreae_free_expr(mount): + return f"sum(node_filesystem_avail_bytes{{mountpoint=\"{mount}\",fstype!~\"tmpfs|overlay\"}})" + + +def build_overview(): + thresholds_percent = { + "mode": "percentage", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 85}, + ], + } + panels = [] + stats = [ + (1, "Running pods", 'sum(kube_pod_status_phase{phase="Running"})'), + (2, "Ready nodes", 'sum(kube_node_status_condition{condition="Ready",status="true"})'), + (3, "Control plane ready", 'sum(kube_node_status_condition{condition="Ready",status="true",node=~"titan-0a|titan-0b|titan-0c"})'), + (4, "Control plane schedulable", 'sum(kube_node_spec_unschedulable{node=~"titan-0a|titan-0b|titan-0c"} == 0)'), + (5, "Problem pods", 'sum(kube_pod_status_phase{phase!~"Running|Succeeded"})'), + (6, "Stuck terminating", 'sum(((time() - kube_pod_deletion_timestamp) > 600))'), + ] + for idx, (panel_id, title, expr) in enumerate(stats): + panels.append( + stat_panel( + panel_id, + title, + expr, + {"h": 5, "w": 4, "x": 4 * idx, "y": 0}, + ) + ) + panels.append( + stat_panel( + 7, + "Hottest node: CPU", + node_cpu_expr(), + {"h": 5, "w": 4, "x": 24, "y": 0}, + unit="percent", + thresholds=thresholds_percent, + text_mode="value_and_name", + legend="{{node}}", + ) + ) + panels.append( + stat_panel( + 8, + "Hottest node: RAM", + node_mem_expr(), + {"h": 5, "w": 4, "x": 28, "y": 0}, + unit="percent", + thresholds=thresholds_percent, + text_mode="value_and_name", + legend="{{node}}", + ) + ) + + panels.append(pie_panel(9, "Namespace CPU share", 'topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!="",pod!="",container!=""}[5m])) by (namespace))', {"h": 9, "w": 12, "x": 0, "y": 5})) + panels.append(pie_panel(10, "Namespace RAM share", 'topk(10, sum(container_memory_working_set_bytes{namespace!="",pod!="",container!=""}) by (namespace))', {"h": 9, "w": 12, "x": 12, "y": 5})) + + panels.append( + timeseries_panel( + 11, + "Cluster node CPU", + node_cpu_expr(), + {"h": 8, "w": 12, "x": 0, "y": 14}, + unit="percent", + legend="{{node}}", + legend_calcs=["last"], + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 12, + "Cluster node RAM", + node_mem_expr(), + {"h": 8, "w": 12, "x": 12, "y": 14}, + unit="percent", + legend="{{node}}", + legend_calcs=["last"], + legend_display="table", + legend_placement="right", + ) + ) + + panels.append( + table_panel( + 13, + "Problem pods (details)", + "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + {"h": 8, "w": 12, "x": 0, "y": 22}, + unit="s", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + panels.append( + table_panel( + 14, + "Terminating >10m", + "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + {"h": 8, "w": 12, "x": 12, "y": 22}, + unit="s", + transformations=[ + {"id": "labelsToFields", "options": {}} , + {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}}, + ], + ) + ) + + panels.append( + timeseries_panel( + 15, + "Control plane CPU", + node_cpu_expr("titan-0a|titan-0b|titan-0c"), + {"h": 7, "w": 12, "x": 0, "y": 30}, + unit="percent", + legend="{{node}}", + ) + ) + panels.append( + timeseries_panel( + 16, + "Control plane RAM", + node_mem_expr("titan-0a|titan-0b|titan-0c"), + {"h": 7, "w": 12, "x": 12, "y": 30}, + unit="percent", + legend="{{node}}", + ) + ) + + panels.append( + timeseries_panel( + 17, + "Root filesystem usage", + root_usage_expr(), + {"h": 8, "w": 12, "x": 0, "y": 37}, + unit="percent", + legend="{{node}}", + legend_calcs=["last"], + legend_display="table", + legend_placement="right", + time_from="7d", + ) + ) + + panels.append( + { + "id": 18, + "type": "bargauge", + "title": "Nodes closest to full root disks", + "datasource": PROM_DS, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 37}, + "targets": [{"expr": f"topk(8, {root_usage_expr()})", "refId": "A", "legendFormat": "{{node}}"}], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "percentage", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 50}, + {"color": "orange", "value": 70}, + {"color": "red", "value": 85}, + ], + }, + }, + "overrides": [], + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": False}, + }, + } + ) + + panels.append( + stat_panel( + 19, + "Astreae usage", + astreae_usage_expr("/mnt/astreae"), + {"h": 6, "w": 6, "x": 0, "y": 45}, + unit="percent", + thresholds=thresholds_percent, + ) + ) + panels.append( + stat_panel( + 20, + "Asteria usage", + astreae_usage_expr("/mnt/asteria"), + {"h": 6, "w": 6, "x": 6, "y": 45}, + unit="percent", + thresholds=thresholds_percent, + ) + ) + panels.append( + stat_panel( + 21, + "Astreae free", + astreae_free_expr("/mnt/astreae"), + {"h": 6, "w": 6, "x": 12, "y": 45}, + unit="bytesSI", + ) + ) + panels.append( + stat_panel( + 22, + "Asteria free", + astreae_free_expr("/mnt/asteria"), + {"h": 6, "w": 6, "x": 18, "y": 45}, + unit="bytesSI", + ) + ) + + panels.append( + table_panel( + 23, + "Astreae per-node usage", + '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} * 100)', + {"h": 8, "w": 12, "x": 0, "y": 51}, + unit="percent", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + panels.append( + table_panel( + 24, + "Asteria per-node usage", + '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} * 100)', + {"h": 8, "w": 12, "x": 12, "y": 51}, + unit="percent", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + + panels.append( + text_panel( + 25, + "About this dashboard", + "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders", + {"h": 5, "w": 24, "x": 0, "y": 59}, + ) + ) + + return { + "uid": "atlas-overview", + "title": "Atlas Overview", + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": {"type": "datasource", "uid": "grafana"}, + "enable": True, + "hide": True, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard", + } + ] + }, + "editable": False, + "folderUid": "atlas-overview", + "graphTooltip": 0, + "links": [ + {"title": "Pods dashboard", "type": "dashboard", "dashboardUid": "atlas-pods", "keepTime": False}, + {"title": "Nodes dashboard", "type": "dashboard", "dashboardUid": "atlas-nodes", "keepTime": False}, + {"title": "Storage dashboard", "type": "dashboard", "dashboardUid": "atlas-storage", "keepTime": False}, + ], + "panels": panels, + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "overview"], + "templating": {"list": []}, + "time": {"from": "now-12h", "to": "now"}, + } + + +def build_pods_dashboard(): + panels = [] + panels.append( + table_panel( + 1, + "Pods not running", + "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + {"h": 10, "w": 24, "x": 0, "y": 0}, + unit="s", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + panels.append( + table_panel( + 2, + "CrashLoop / ImagePull", + "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", + {"h": 10, "w": 24, "x": 0, "y": 10}, + unit="s", + transformations=[{"id": "labelsToFields", "options": {}}], + ) + ) + panels.append( + table_panel( + 3, + "Terminating pods", + "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + {"h": 10, "w": 24, "x": 0, "y": 20}, + unit="s", + transformations=[ + {"id": "labelsToFields", "options": {}} , + {"id": "filterByValue", "options": {"match": "Value", "operator": "gt", "value": 600}}, + ], + ) + ) + return { + "uid": "atlas-pods", + "title": "Atlas Pods", + "folderUid": "atlas-pods", + "editable": True, + "panels": panels, + "time": {"from": "now-12h", "to": "now"}, + "annotations": {"list": []}, + "schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "pods"], + } + + +def build_nodes_dashboard(): + panels = [] + panels.append(stat_panel(1, "Node count", 'count(kube_node_info)', {"h": 5, "w": 6, "x": 0, "y": 0})) + panels.append(stat_panel(2, "Ready nodes", 'sum(kube_node_status_condition{condition="Ready",status="true"})', {"h": 5, "w": 6, "x": 6, "y": 0})) + panels.append(stat_panel(3, "Control plane CPU avg", node_cpu_expr("titan-0a|titan-0b|titan-0c"), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="percent", legend="{{node}}", text_mode="value_and_name")) + panels.append(stat_panel(4, "Control plane RAM avg", node_mem_expr("titan-0a|titan-0b|titan-0c"), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="percent", legend="{{node}}", text_mode="value_and_name")) + panels.append(timeseries_panel(5, "Node CPU", node_cpu_expr(), {"h": 9, "w": 24, "x": 0, "y": 5}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right")) + panels.append(timeseries_panel(6, "Node RAM", node_mem_expr(), {"h": 9, "w": 24, "x": 0, "y": 14}, unit="percent", legend="{{node}}", legend_calcs=["last"], legend_display="table", legend_placement="right")) + panels.append(timeseries_panel(7, "Root filesystem", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 23}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="7d")) + return { + "uid": "atlas-nodes", + "title": "Atlas Nodes", + "folderUid": "atlas-nodes", + "editable": True, + "panels": panels, + "time": {"from": "now-12h", "to": "now"}, + "annotations": {"list": []}, + "schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "nodes"], + } + + +def build_storage_dashboard(): + panels = [] + panels.append(stat_panel(1, "Astreae usage", astreae_usage_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 0, "y": 0}, unit="percent")) + panels.append(stat_panel(2, "Asteria usage", astreae_usage_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 6, "y": 0}, unit="percent")) + panels.append(stat_panel(3, "Astreae free", astreae_free_expr("/mnt/astreae"), {"h": 5, "w": 6, "x": 12, "y": 0}, unit="bytesSI")) + panels.append(stat_panel(4, "Asteria free", astreae_free_expr("/mnt/asteria"), {"h": 5, "w": 6, "x": 18, "y": 0}, unit="bytesSI")) + panels.append(timeseries_panel(5, "Root filesystem", root_usage_expr(), {"h": 9, "w": 24, "x": 0, "y": 5}, unit="percent", legend="{{node}}", legend_display="table", legend_placement="right", time_from="30d")) + panels.append(table_panel(6, "Astreae nodes", '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/astreae",fstype!~"tmpfs|overlay"} * 100)', {"h": 10, "w": 12, "x": 0, "y": 14}, unit="percent", transformations=[{"id": "labelsToFields", "options": {}}])) + panels.append(table_panel(7, "Asteria nodes", '100 - (node_filesystem_avail_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{mountpoint="/mnt/asteria",fstype!~"tmpfs|overlay"} * 100)', {"h": 10, "w": 12, "x": 12, "y": 14}, unit="percent", transformations=[{"id": "labelsToFields", "options": {}}])) + return { + "uid": "atlas-storage", + "title": "Atlas Storage", + "folderUid": "atlas-storage", + "editable": True, + "panels": panels, + "time": {"from": "now-12h", "to": "now"}, + "annotations": {"list": []}, + "schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "storage"], + } + + +DASHBOARDS = { + "atlas-overview": { + "builder": build_overview, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-overview.yaml", + }, + "atlas-pods": { + "builder": build_pods_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-pods.yaml", + }, + "atlas-nodes": { + "builder": build_nodes_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-nodes.yaml", + }, + "atlas-storage": { + "builder": build_storage_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-storage.yaml", + }, +} + + +def write_json(uid: str, data: dict) -> None: + DASHBOARD_DIR.mkdir(parents=True, exist_ok=True) + path = DASHBOARD_DIR / f"{uid}.json" + path.write_text(json.dumps(data, indent=2) + "\n") + + +def render_configmap(uid: str, data: dict) -> None: + json_path = DASHBOARD_DIR / f"{uid}.json" + payload = json.dumps(json.loads(json_path.read_text()), indent=2) + indented = "\n".join(" " + line for line in payload.splitlines()) + output_path = data["configmap"] + content = CONFIG_TEMPLATE.format( + relative_path=output_path.relative_to(ROOT), + name=output_path.stem, + key=json_path.name, + payload=indented, + ) + output_path.write_text(content) + print(f"Rendered {json_path.name} -> {output_path.relative_to(ROOT)}") + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--build", action="store_true", help="Regenerate dashboard JSON files from builders") + args = parser.parse_args() + + if args.build: + for uid, info in DASHBOARDS.items(): + write_json(uid, info["builder"]()) + + for uid, info in DASHBOARDS.items(): + render_configmap(uid, info) + + +if __name__ == "__main__": + main() diff --git a/services/monitoring/dashboards/atlas-nodes.json b/services/monitoring/dashboards/atlas-nodes.json new file mode 100644 index 0000000..d3393a9 --- /dev/null +++ b/services/monitoring/dashboards/atlas-nodes.json @@ -0,0 +1,369 @@ +{ + "uid": "atlas-nodes", + "title": "Atlas Nodes", + "folderUid": "atlas-nodes", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Node count", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "count(kube_node_info)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Ready nodes", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Control plane CPU avg", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 4, + "type": "stat", + "title": "Control plane RAM avg", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 5, + "type": "timeseries", + "title": "Node CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 5 + }, + "targets": [ + { + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 6, + "type": "timeseries", + "title": "Node RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 14 + }, + "targets": [ + { + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 7, + "type": "timeseries", + "title": "Root filesystem", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 23 + }, + "targets": [ + { + "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "7d" + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "nodes" + ] +} diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json new file mode 100644 index 0000000..d7a0d27 --- /dev/null +++ b/services/monitoring/dashboards/atlas-overview.json @@ -0,0 +1,1270 @@ +{ + "uid": "atlas-overview", + "title": "Atlas Overview", + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": false, + "folderUid": "atlas-overview", + "graphTooltip": 0, + "links": [ + { + "title": "Pods dashboard", + "type": "dashboard", + "dashboardUid": "atlas-pods", + "keepTime": false + }, + { + "title": "Nodes dashboard", + "type": "dashboard", + "dashboardUid": "atlas-nodes", + "keepTime": false + }, + { + "title": "Storage dashboard", + "type": "dashboard", + "dashboardUid": "atlas-storage", + "keepTime": false + } + ], + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Running pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_pod_status_phase{phase=\"Running\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Ready nodes", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Control plane ready", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Control plane schedulable", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_node_spec_unschedulable{node=~\"titan-0a|titan-0b|titan-0c\"} == 0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Problem pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Stuck terminating", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 0 + }, + "targets": [ + { + "expr": "sum(((time() - kube_pod_deletion_timestamp) > 600))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Hottest node: CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 24, + "y": 0 + }, + "targets": [ + { + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 8, + "type": "stat", + "title": "Hottest node: RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 28, + "y": 0 + }, + "targets": [ + { + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 9, + "type": "piechart", + "title": "Namespace CPU share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 5 + }, + "targets": [ + { + "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 10, + "type": "piechart", + "title": "Namespace RAM share", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 5 + }, + "targets": [ + { + "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))", + "refId": "A", + "legendFormat": "{{namespace}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 11, + "type": "timeseries", + "title": "Cluster node CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "targets": [ + { + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 12, + "type": "timeseries", + "title": "Cluster node RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "targets": [ + { + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 13, + "type": "table", + "title": "Problem pods (details)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 22 + }, + "targets": [ + { + "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 14, + "type": "table", + "title": "Terminating >10m", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "targets": [ + { + "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "filterByValue", + "options": { + "match": "Value", + "operator": "gt", + "value": 600 + } + } + ] + }, + { + "id": 15, + "type": "timeseries", + "title": "Control plane CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 30 + }, + "targets": [ + { + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 16, + "type": "timeseries", + "title": "Control plane RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 30 + }, + "targets": [ + { + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 17, + "type": "timeseries", + "title": "Root filesystem usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 37 + }, + "targets": [ + { + "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "7d" + }, + { + "id": 18, + "type": "bargauge", + "title": "Nodes closest to full root disks", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 37 + }, + "targets": [ + { + "expr": "topk(8, avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + } + }, + { + "id": 19, + "type": "stat", + "title": "Astreae usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 45 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 20, + "type": "stat", + "title": "Asteria usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 45 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 21, + "type": "stat", + "title": "Astreae free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 45 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 22, + "type": "stat", + "title": "Asteria free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 45 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 23, + "type": "table", + "title": "Astreae per-node usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 51 + }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 24, + "type": "table", + "title": "Asteria per-node usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 51 + }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 25, + "type": "text", + "title": "About this dashboard", + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 59 + }, + "datasource": null, + "options": { + "mode": "markdown", + "content": "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders" + } + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "overview" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-12h", + "to": "now" + } +} diff --git a/services/monitoring/dashboards/atlas-pods.json b/services/monitoring/dashboards/atlas-pods.json new file mode 100644 index 0000000..91f80eb --- /dev/null +++ b/services/monitoring/dashboards/atlas-pods.json @@ -0,0 +1,137 @@ +{ + "uid": "atlas-pods", + "title": "Atlas Pods", + "folderUid": "atlas-pods", + "editable": true, + "panels": [ + { + "id": 1, + "type": "table", + "title": "Pods not running", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 2, + "type": "table", + "title": "CrashLoop / ImagePull", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 10 + }, + "targets": [ + { + "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 3, + "type": "table", + "title": "Terminating pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 20 + }, + "targets": [ + { + "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "filterByValue", + "options": { + "match": "Value", + "operator": "gt", + "value": 600 + } + } + ] + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "pods" + ] +} diff --git a/services/monitoring/dashboards/atlas-storage.json b/services/monitoring/dashboards/atlas-storage.json new file mode 100644 index 0000000..aa1948d --- /dev/null +++ b/services/monitoring/dashboards/atlas-storage.json @@ -0,0 +1,359 @@ +{ + "uid": "atlas-storage", + "title": "Atlas Storage", + "folderUid": "atlas-storage", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Astreae usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Asteria usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Astreae free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Asteria free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "timeseries", + "title": "Root filesystem", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 5 + }, + "targets": [ + { + "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "30d" + }, + { + "id": 6, + "type": "table", + "title": "Astreae nodes", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 14 + }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 7, + "type": "table", + "title": "Asteria nodes", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 14 + }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "storage" + ] +} diff --git a/services/monitoring/grafana-dashboard-sre.yaml b/services/monitoring/grafana-dashboard-nodes.yaml similarity index 53% rename from services/monitoring/grafana-dashboard-sre.yaml rename to services/monitoring/grafana-dashboard-nodes.yaml index d5d8dca..516f207 100644 --- a/services/monitoring/grafana-dashboard-sre.yaml +++ b/services/monitoring/grafana-dashboard-nodes.yaml @@ -1,38 +1,22 @@ -# services/monitoring/grafana-dashboard-sre.yaml +# services/monitoring/grafana-dashboard-nodes.yaml apiVersion: v1 kind: ConfigMap metadata: - name: grafana-dashboard-sre + name: grafana-dashboard-nodes labels: grafana_dashboard: "1" data: - atlas-sre-overview.json: | + atlas-nodes.json: | { - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, + "uid": "atlas-nodes", + "title": "Atlas Nodes", + "folderUid": "atlas-nodes", "editable": true, - "folderUid": "atlas-sre", - "graphTooltip": 0, - "links": [], "panels": [ { "id": 1, "type": "stat", - "title": "Ready nodes", + "title": "Node count", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -45,7 +29,7 @@ data: }, "targets": [ { - "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"}) / count(kube_node_info) * 100", + "expr": "count(kube_node_info)", "refId": "A" } ], @@ -56,23 +40,19 @@ data: }, "mappings": [], "thresholds": { - "mode": "percentage", + "mode": "absolute", "steps": [ { - "color": "red", + "color": "rgba(115, 115, 115, 1)", "value": null }, - { - "color": "yellow", - "value": 95 - }, { "color": "green", - "value": 99 + "value": 1 } ] }, - "unit": "percent" + "unit": "none" }, "overrides": [] }, @@ -93,7 +73,7 @@ data: { "id": 2, "type": "stat", - "title": "Pending pods", + "title": "Ready nodes", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -106,7 +86,7 @@ data: }, "targets": [ { - "expr": "sum(kube_pod_status_phase{phase=\"Pending\"})", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", "refId": "A" } ], @@ -120,16 +100,12 @@ data: "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", - "value": 3 - }, - { - "color": "red", - "value": 10 + "color": "green", + "value": 1 } ] }, @@ -154,7 +130,7 @@ data: { "id": 3, "type": "stat", - "title": "Unavailable deployment replicas", + "title": "Control plane CPU avg", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -167,8 +143,9 @@ data: }, "targets": [ { - "expr": "sum(kube_deployment_status_replicas_unavailable)", - "refId": "A" + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -181,20 +158,16 @@ data: "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", + "color": "green", "value": 1 - }, - { - "color": "red", - "value": 3 } ] }, - "unit": "none" + "unit": "percent" }, "overrides": [] }, @@ -209,13 +182,13 @@ data: "fields": "", "values": false }, - "textMode": "value" + "textMode": "value_and_name" } }, { "id": 4, "type": "stat", - "title": "Active alerts", + "title": "Control plane RAM avg", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -228,8 +201,9 @@ data: }, "targets": [ { - "expr": "sum(ALERTS{alertstate=\"firing\"})", - "refId": "A" + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -242,20 +216,16 @@ data: "mode": "absolute", "steps": [ { - "color": "green", + "color": "rgba(115, 115, 115, 1)", "value": null }, { - "color": "yellow", + "color": "green", "value": 1 - }, - { - "color": "red", - "value": 3 } ] }, - "unit": "none" + "unit": "percent" }, "overrides": [] }, @@ -270,20 +240,20 @@ data: "fields": "", "values": false }, - "textMode": "value" + "textMode": "value_and_name" } }, { "id": 5, "type": "timeseries", - "title": "Node CPU usage", + "title": "Node CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 12, + "w": 24, "x": 0, "y": 5 }, @@ -303,7 +273,10 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right", + "calcs": [ + "last" + ] }, "tooltip": { "mode": "multi" @@ -313,16 +286,16 @@ data: { "id": 6, "type": "timeseries", - "title": "Node memory usage", + "title": "Node RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 12, - "x": 12, - "y": 5 + "w": 24, + "x": 0, + "y": 14 }, "targets": [ { @@ -340,7 +313,10 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right", + "calcs": [ + "last" + ] }, "tooltip": { "mode": "multi" @@ -350,201 +326,22 @@ data: { "id": 7, "type": "timeseries", - "title": "Top pod CPU (5m avg)", + "title": "Root filesystem", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { "h": 9, - "w": 12, - "x": 0, - "y": 14 - }, - "targets": [ - { - "expr": "topk(5, sum(rate(container_cpu_usage_seconds_total{pod!=\"\",container!=\"\"}[5m])) by (namespace,pod))", - "refId": "A", - "legendFormat": "{{namespace}}/{{pod}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "cores" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 8, - "type": "timeseries", - "title": "Top pod memory working set", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 14 - }, - "targets": [ - { - "expr": "topk(5, sum(container_memory_working_set_bytes{pod!=\"\",container!=\"\"}) by (namespace,pod))", - "refId": "A", - "legendFormat": "{{namespace}}/{{pod}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "bytes" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 9, - "type": "bargauge", - "title": "Namespace restart rate (6h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, + "w": 24, "x": 0, "y": 23 }, "targets": [ { - "expr": "topk(8, sum(increase(kube_pod_container_status_restarts_total{namespace!=\"\"}[6h])) by (namespace))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - } - }, - { - "id": 10, - "type": "table", - "title": "Deployments missing replicas", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 23 - }, - "targets": [ - { - "expr": "topk(10, sum by (namespace,deployment) (kube_deployment_status_replicas_unavailable))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [] - }, - "options": { - "showHeader": true - } - }, - { - "id": 11, - "type": "timeseries", - "title": "Pod phase breakdown", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 31 - }, - "targets": [ - { - "expr": "sum(kube_pod_status_phase) by (phase)", + "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", "refId": "A", - "legendFormat": "{{phase}}" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 12, - "type": "timeseries", - "title": "PVC usage (top 8)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 31 - }, - "targets": [ - { - "expr": "topk(8, sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100))", - "refId": "A", - "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}" + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -556,28 +353,26 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right" }, "tooltip": { "mode": "multi" } - } + }, + "timeFrom": "7d" } ], - "schemaVersion": 39, - "style": "dark", - "tags": [ - "atlas", - "sre" - ], - "templating": { - "list": [] - }, "time": { "from": "now-12h", "to": "now" }, - "title": "Atlas SRE Overview", - "uid": "atlas-sre", - "version": 4 + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "nodes" + ] } diff --git a/services/monitoring/grafana-dashboard-public.yaml b/services/monitoring/grafana-dashboard-overview.yaml similarity index 67% rename from services/monitoring/grafana-dashboard-public.yaml rename to services/monitoring/grafana-dashboard-overview.yaml index 35fa124..a20e05a 100644 --- a/services/monitoring/grafana-dashboard-public.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -1,13 +1,15 @@ -# services/monitoring/grafana-dashboard-public.yaml +# services/monitoring/grafana-dashboard-overview.yaml apiVersion: v1 kind: ConfigMap metadata: - name: grafana-dashboard-public + name: grafana-dashboard-overview labels: grafana_dashboard: "1" data: - atlas-public-overview.json: | + atlas-overview.json: | { + "uid": "atlas-overview", + "title": "Atlas Overview", "annotations": { "list": [ { @@ -25,9 +27,28 @@ data: ] }, "editable": false, - "folderUid": "atlas-public", + "folderUid": "atlas-overview", "graphTooltip": 0, - "links": [], + "links": [ + { + "title": "Pods dashboard", + "type": "dashboard", + "dashboardUid": "atlas-pods", + "keepTime": false + }, + { + "title": "Nodes dashboard", + "type": "dashboard", + "dashboardUid": "atlas-nodes", + "keepTime": false + }, + { + "title": "Storage dashboard", + "type": "dashboard", + "dashboardUid": "atlas-storage", + "keepTime": false + } + ], "panels": [ { "id": 1, @@ -146,7 +167,7 @@ data: { "id": 3, "type": "stat", - "title": "Cluster nodes", + "title": "Control plane ready", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -159,7 +180,7 @@ data: }, "targets": [ { - "expr": "count(kube_node_info)", + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\",node=~\"titan-0a|titan-0b|titan-0c\"})", "refId": "A" } ], @@ -203,7 +224,7 @@ data: { "id": 4, "type": "stat", - "title": "Hottest node CPU", + "title": "Control plane schedulable", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -216,10 +237,182 @@ data: }, "targets": [ { - "expr": "topk(1, avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)))", + "expr": "sum(kube_node_spec_unschedulable{node=~\"titan-0a|titan-0b|titan-0c\"} == 0)", "refId": "A" } ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "stat", + "title": "Problem pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 0 + }, + "targets": [ + { + "expr": "sum(kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 6, + "type": "stat", + "title": "Stuck terminating", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 0 + }, + "targets": [ + { + "expr": "sum(((time() - kube_pod_deletion_timestamp) > 600))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 7, + "type": "stat", + "title": "Hottest node: CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 24, + "y": 0 + }, + "targets": [ + { + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" + } + ], "fieldConfig": { "defaults": { "color": { @@ -262,9 +455,9 @@ data: } }, { - "id": 5, + "id": 8, "type": "stat", - "title": "Hottest node memory", + "title": "Hottest node: RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -272,13 +465,14 @@ data: "gridPos": { "h": 5, "w": 4, - "x": 16, + "x": 28, "y": 0 }, "targets": [ { - "expr": "topk(1, avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)))", - "refId": "A" + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { @@ -296,11 +490,11 @@ data: }, { "color": "yellow", - "value": 75 + "value": 70 }, { "color": "red", - "value": 90 + "value": 85 } ] }, @@ -323,68 +517,7 @@ data: } }, { - "id": 6, - "type": "stat", - "title": "Failed pods (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 5, - "w": 4, - "x": 20, - "y": 0 - }, - "targets": [ - { - "expr": "sum(increase(kube_pod_status_phase{phase=\"Failed\"}[24h]))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "red", - "value": 3 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - } - }, - { - "id": 7, + "id": 9, "type": "piechart", "title": "Namespace CPU share", "datasource": { @@ -400,12 +533,13 @@ data: "targets": [ { "expr": "topk(10, sum(rate(container_cpu_usage_seconds_total{namespace!=\"\",pod!=\"\",container!=\"\"}[5m])) by (namespace))", - "refId": "A" + "refId": "A", + "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { - "unit": "cores" + "unit": "percent" }, "overrides": [] }, @@ -425,9 +559,9 @@ data: } }, { - "id": 8, + "id": 10, "type": "piechart", - "title": "Namespace memory share", + "title": "Namespace RAM share", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -441,12 +575,13 @@ data: "targets": [ { "expr": "topk(10, sum(container_memory_working_set_bytes{namespace!=\"\",pod!=\"\",container!=\"\"}) by (namespace))", - "refId": "A" + "refId": "A", + "legendFormat": "{{namespace}}" } ], "fieldConfig": { "defaults": { - "unit": "bytes" + "unit": "percent" }, "overrides": [] }, @@ -455,7 +590,7 @@ data: "displayMode": "list", "placement": "right" }, - "pieType": "donut", + "pieType": "pie", "reduceOptions": { "calcs": [ "lastNotNull" @@ -466,9 +601,9 @@ data: } }, { - "id": 9, + "id": 11, "type": "timeseries", - "title": "Node CPU usage (per node)", + "title": "Cluster node CPU", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -495,7 +630,10 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right", + "calcs": [ + "last" + ] }, "tooltip": { "mode": "multi" @@ -503,9 +641,9 @@ data: } }, { - "id": 10, + "id": 12, "type": "timeseries", - "title": "Node memory usage (per node)", + "title": "Cluster node RAM", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -532,80 +670,20 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "right", + "calcs": [ + "last" + ] }, "tooltip": { "mode": "multi" } } }, - { - "id": 11, - "type": "table", - "title": "Key service availability", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 22 - }, - "targets": [ - { - "expr": "label_replace((sum by (deployment,namespace) (kube_deployment_status_replicas_available{deployment=~\"traefik|gitea|grafana\",namespace=~\"traefik|gitea|monitoring\"}) / sum by (deployment,namespace) (kube_deployment_spec_replicas{deployment=~\"traefik|gitea|grafana\",namespace=~\"traefik|gitea|monitoring\"})), \"service\", \"$1\", \"deployment\", \"(.*)\") or label_replace((sum by (statefulset,namespace) (kube_statefulset_status_replicas_ready{statefulset=~\"vault|alertmanager|victoria-metrics-single-server\",namespace=~\"vault|monitoring\"}) / sum by (statefulset,namespace) (kube_statefulset_status_replicas{statefulset=~\"vault|alertmanager|victoria-metrics-single-server\",namespace=~\"vault|monitoring\"})), \"service\", \"$1\", \"statefulset\", \"(.*)\")", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "custom": { - "align": "auto" - }, - "unit": "percent" - }, - "overrides": [] - }, - "options": { - "showHeader": true - } - }, - { - "id": 12, - "type": "table", - "title": "Failed pods by namespace (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 22 - }, - "targets": [ - { - "expr": "topk(10, sum by (namespace) (increase(kube_pod_status_phase{phase=\"Failed\"}[24h])))", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [] - }, - "options": { - "showHeader": true - } - }, { "id": 13, - "type": "timeseries", - "title": "Root filesystem usage per node", + "type": "table", + "title": "Problem pods (details)", "datasource": { "type": "prometheus", "uid": "atlas-vm" @@ -614,11 +692,91 @@ data: "h": 8, "w": 12, "x": 0, - "y": 29 + "y": 22 }, "targets": [ { - "expr": "avg by (node) (((label_replace(1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info))", + "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 14, + "type": "table", + "title": "Terminating >10m", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "targets": [ + { + "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "filterByValue", + "options": { + "match": "Value", + "operator": "gt", + "value": 600 + } + } + ] + }, + { + "id": 15, + "type": "timeseries", + "title": "Control plane CPU", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 30 + }, + "targets": [ + { + "expr": "avg by (node) ((((1 - label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\")) * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", "refId": "A", "legendFormat": "{{node}}" } @@ -640,7 +798,85 @@ data: } }, { - "id": 14, + "id": 16, + "type": "timeseries", + "title": "Control plane RAM", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 30 + }, + "targets": [ + { + "expr": "avg by (node) (((label_replace((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)) * on(node) group_left() kube_node_info{node=~\"titan-0a|titan-0b|titan-0c\"}", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 17, + "type": "timeseries", + "title": "Root filesystem usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 37 + }, + "targets": [ + { + "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "calcs": [ + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "7d" + }, + { + "id": 18, "type": "bargauge", "title": "Nodes closest to full root disks", "datasource": { @@ -651,19 +887,41 @@ data: "h": 8, "w": 12, "x": 12, - "y": 29 + "y": 37 }, "targets": [ { - "expr": "topk(8, avg by (node) (((label_replace(1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}), \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\") * 100) * on (internal_ip) group_left(node) kube_node_info)))", - "refId": "A" + "expr": "topk(8, avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info))", + "refId": "A", + "legendFormat": "{{node}}" } ], "fieldConfig": { "defaults": { "unit": "percent", "min": 0, - "max": 100 + "max": 100, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } }, "overrides": [] }, @@ -680,7 +938,7 @@ data: } }, { - "id": 15, + "id": 19, "type": "stat", "title": "Astreae usage", "datasource": { @@ -688,14 +946,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 6, "w": 6, "x": 0, - "y": 37 + "y": 45 }, "targets": [ { - "expr": "(sum(longhorn_disk_usage_bytes{disk=~\"astreae-.*\"}) / sum(longhorn_disk_capacity_bytes{disk=~\"astreae-.*\"})) * 100", + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], @@ -741,7 +999,7 @@ data: } }, { - "id": 16, + "id": 20, "type": "stat", "title": "Asteria usage", "datasource": { @@ -749,14 +1007,14 @@ data: "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 6, "w": 6, "x": 6, - "y": 37 + "y": 45 }, "targets": [ { - "expr": "(sum(longhorn_disk_usage_bytes{disk=~\"asteria-.*\"}) / sum(longhorn_disk_capacity_bytes{disk=~\"asteria-.*\"})) * 100", + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", "refId": "A" } ], @@ -802,22 +1060,22 @@ data: } }, { - "id": 17, + "id": 21, "type": "stat", - "title": "Astreae schedulable", + "title": "Astreae free", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 6, "w": 6, "x": 12, - "y": 37 + "y": 45 }, "targets": [ { - "expr": "(sum(longhorn_disk_capacity_bytes{disk=~\"astreae-.*\"}) - sum(longhorn_disk_usage_bytes{disk=~\"astreae-.*\"}) - sum(longhorn_disk_reservation_bytes{disk=~\"astreae-.*\"}))", + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})", "refId": "A" } ], @@ -859,22 +1117,22 @@ data: } }, { - "id": 18, + "id": 22, "type": "stat", - "title": "Asteria schedulable", + "title": "Asteria free", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 6, "w": 6, "x": 18, - "y": 37 + "y": 45 }, "targets": [ { - "expr": "(sum(longhorn_disk_capacity_bytes{disk=~\"asteria-.*\"}) - sum(longhorn_disk_usage_bytes{disk=~\"asteria-.*\"}) - sum(longhorn_disk_reservation_bytes{disk=~\"asteria-.*\"}))", + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})", "refId": "A" } ], @@ -916,112 +1174,91 @@ data: } }, { - "id": 19, - "type": "piechart", - "title": "Longhorn node readiness", + "id": 23, + "type": "table", + "title": "Astreae per-node usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 8, "w": 12, "x": 0, - "y": 44 + "y": 51 }, "targets": [ { - "expr": "sum(longhorn_node_status{condition=\"ready\"})", - "refId": "A", - "legendFormat": "Ready" - }, - { - "expr": "(longhorn_node_count_total - sum(longhorn_node_status{condition=\"ready\"}))", - "refId": "B", - "legendFormat": "Offline" + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "none" + "unit": "percent" }, "overrides": [] }, "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "donut", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} } - } + ] }, { - "id": 20, - "type": "piechart", - "title": "Longhorn disk schedulability", + "id": 24, + "type": "table", + "title": "Asteria per-node usage", "datasource": { "type": "prometheus", "uid": "atlas-vm" }, "gridPos": { - "h": 7, + "h": 8, "w": 12, "x": 12, - "y": 44 + "y": 51 }, "targets": [ { - "expr": "sum(sum by (node,disk) (longhorn_disk_status{condition=\"schedulable\"}))", - "refId": "A", - "legendFormat": "Schedulable" - }, - { - "expr": "(count(sum by (node,disk) (longhorn_disk_status{condition=\"ready\"})) - sum(sum by (node,disk) (longhorn_disk_status{condition=\"schedulable\"})))", - "refId": "B", - "legendFormat": "Blocked" + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" } ], "fieldConfig": { "defaults": { - "unit": "none" + "unit": "percent" }, "overrides": [] }, "options": { - "legend": { - "displayMode": "list", - "placement": "right" - }, - "pieType": "donut", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} } - } + ] }, { - "id": 21, + "id": 25, "type": "text", "title": "About this dashboard", "gridPos": { "h": 5, "w": 24, "x": 0, - "y": 51 + "y": 59 }, + "datasource": null, "options": { "mode": "markdown", - "content": "### Atlas at a glance\n- Flux-managed Pi + Jetson cluster with 20+ active nodes\n- Longhorn tiers: Astreae (3x replicas) & Asteria (2x replicas) tracked separately\n- Login for the SRE view with alert routing, Longhorn drilldowns, and workload burn rates" + "content": "### Atlas at a glance\n- Summary metrics above pull from dedicated Pods/Nodes/Storage dashboards\n- Click the dashboard links in the header to drill into details\n- Anonymous users see this overview; login unlocks the detailed folders" } } ], @@ -1030,7 +1267,7 @@ data: "style": "dark", "tags": [ "atlas", - "public" + "overview" ], "templating": { "list": [] @@ -1038,8 +1275,5 @@ data: "time": { "from": "now-12h", "to": "now" - }, - "title": "Atlas Public Overview", - "uid": "atlas-public", - "version": 5 + } } diff --git a/services/monitoring/grafana-dashboard-pods.yaml b/services/monitoring/grafana-dashboard-pods.yaml new file mode 100644 index 0000000..3b1f5da --- /dev/null +++ b/services/monitoring/grafana-dashboard-pods.yaml @@ -0,0 +1,146 @@ +# services/monitoring/grafana-dashboard-pods.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-pods + labels: + grafana_dashboard: "1" +data: + atlas-pods.json: | + { + "uid": "atlas-pods", + "title": "Atlas Pods", + "folderUid": "atlas-pods", + "editable": true, + "panels": [ + { + "id": 1, + "type": "table", + "title": "Pods not running", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod) group_left(phase) sum by (namespace,pod,phase) (kube_pod_status_phase{phase!~\"Running|Succeeded\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 2, + "type": "table", + "title": "CrashLoop / ImagePull", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 10 + }, + "targets": [ + { + "expr": "(time() - kube_pod_created) * on(namespace,pod) group_left(node) kube_pod_info * on(namespace,pod,container) group_left(reason) sum by (namespace,pod,reason) (kube_pod_container_status_waiting_reason{reason=~\"CrashLoopBackOff|ImagePullBackOff\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 3, + "type": "table", + "title": "Terminating pods", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 20 + }, + "targets": [ + { + "expr": "(time() - kube_pod_deletion_timestamp) * on(namespace,pod) group_left(node) kube_pod_info", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "filterByValue", + "options": { + "match": "Value", + "operator": "gt", + "value": 600 + } + } + ] + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "pods" + ] + } diff --git a/services/monitoring/grafana-dashboard-storage.yaml b/services/monitoring/grafana-dashboard-storage.yaml new file mode 100644 index 0000000..5b22804 --- /dev/null +++ b/services/monitoring/grafana-dashboard-storage.yaml @@ -0,0 +1,368 @@ +# services/monitoring/grafana-dashboard-storage.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-storage + labels: + grafana_dashboard: "1" +data: + atlas-storage.json: | + { + "uid": "atlas-storage", + "title": "Atlas Storage", + "folderUid": "atlas-storage", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Astreae usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "Asteria usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Astreae free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Asteria free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bytesSI" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "timeseries", + "title": "Root filesystem", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 5 + }, + "targets": [ + { + "expr": "avg by (node) (((1 - (label_replace(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs|overlay\"}, \"internal_ip\", \"$1\", \"instance\", \"([^:]+):.*\"))) * 100) * on (internal_ip) group_left(node) kube_node_info)", + "refId": "A", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + }, + "timeFrom": "30d" + }, + { + "id": 6, + "type": "table", + "title": "Astreae nodes", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 14 + }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + }, + { + "id": 7, + "type": "table", + "title": "Asteria nodes", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 14 + }, + "targets": [ + { + "expr": "100 - (node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"} * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, + "options": { + "showHeader": true + }, + "transformations": [ + { + "id": "labelsToFields", + "options": {} + } + ] + } + ], + "time": { + "from": "now-12h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "storage" + ] + } diff --git a/services/monitoring/grafana-folders.yaml b/services/monitoring/grafana-folders.yaml index 503aaee..d390679 100644 --- a/services/monitoring/grafana-folders.yaml +++ b/services/monitoring/grafana-folders.yaml @@ -10,8 +10,8 @@ data: folders.yaml: | apiVersion: 1 folders: - - uid: atlas-public - title: Atlas Public + - uid: atlas-overview + title: Atlas Overview permissions: - role: Viewer permission: View @@ -19,8 +19,22 @@ data: permission: Edit - role: Admin permission: Admin - - uid: atlas-sre - title: Atlas SRE + - uid: atlas-pods + title: Atlas Pods + permissions: + - role: Editor + permission: View + - role: Admin + permission: Admin + - uid: atlas-nodes + title: Atlas Nodes + permissions: + - role: Editor + permission: View + - role: Admin + permission: Admin + - uid: atlas-storage + title: Atlas Storage permissions: - role: Editor permission: View diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index 4efae70..e23f903 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -244,8 +244,8 @@ spec: GF_SECURITY_ALLOW_EMBEDDING: "true" grafana.ini: server: - domain: atlas.metrics.bstein.dev - root_url: https://atlas.metrics.bstein.dev/ + domain: metrics.bstein.dev + root_url: https://metrics.bstein.dev/ auth.anonymous: hide_version: true users: @@ -256,12 +256,12 @@ spec: annotations: cert-manager.io/cluster-issuer: letsencrypt hosts: - - atlas.metrics.bstein.dev + - metrics.bstein.dev path: / tls: - - secretName: grafana-atlas-metrics-tls + - secretName: grafana-metrics-tls hosts: - - atlas.metrics.bstein.dev + - metrics.bstein.dev datasources: datasources.yaml: apiVersion: 1 @@ -278,25 +278,43 @@ spec: dashboardproviders.yaml: apiVersion: 1 providers: - - name: public + - name: overview orgId: 1 - folder: Atlas Public + folder: Atlas Overview type: file disableDeletion: false editable: false options: - path: /var/lib/grafana/dashboards/public - - name: sre + path: /var/lib/grafana/dashboards/overview + - name: pods orgId: 1 - folder: Atlas SRE + folder: Atlas Pods type: file disableDeletion: false editable: true options: - path: /var/lib/grafana/dashboards/sre + path: /var/lib/grafana/dashboards/pods + - name: nodes + orgId: 1 + folder: Atlas Nodes + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/nodes + - name: storage + orgId: 1 + folder: Atlas Storage + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/storage dashboardsConfigMaps: - public: grafana-dashboard-public - sre: grafana-dashboard-sre + overview: grafana-dashboard-overview + pods: grafana-dashboard-pods + nodes: grafana-dashboard-nodes + storage: grafana-dashboard-storage extraConfigmapMounts: - name: grafana-folders mountPath: /etc/grafana/provisioning/folders @@ -327,14 +345,14 @@ spec: annotations: cert-manager.io/cluster-issuer: letsencrypt hosts: - - host: atlas.alerts.bstein.dev + - host: alerts.bstein.dev paths: - path: / pathType: Prefix tls: - secretName: alerts-bstein-dev-tls hosts: - - atlas.alerts.bstein.dev + - alerts.bstein.dev config: global: resolve_timeout: 5m diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 73e7d23..282ee4f 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -5,7 +5,9 @@ namespace: monitoring resources: - namespace.yaml - rbac.yaml - - grafana-dashboard-public.yaml - - grafana-dashboard-sre.yaml + - grafana-dashboard-overview.yaml + - grafana-dashboard-pods.yaml + - grafana-dashboard-nodes.yaml + - grafana-dashboard-storage.yaml - grafana-folders.yaml - helmrelease.yaml