From 40dce5ee49078dda9586879b65b0d6360902e795 Mon Sep 17 00:00:00 2001 From: Brad Stein Date: Fri, 3 Apr 2026 14:55:16 -0300 Subject: [PATCH] monitoring: add power dashboard and reorder atlas overview rows --- scripts/dashboards_render_atlas.py | 440 +++-- .../monitoring/dashboards/atlas-overview.json | 1470 ++++++++++------- .../monitoring/dashboards/atlas-power.json | 553 +++++++ .../grafana-dashboard-overview.yaml | 1470 ++++++++++------- .../monitoring/grafana-dashboard-power.yaml | 562 +++++++ services/monitoring/helmrelease.yaml | 10 + services/monitoring/kustomization.yaml | 1 + 7 files changed, 3209 insertions(+), 1297 deletions(-) create mode 100644 services/monitoring/dashboards/atlas-power.json create mode 100644 services/monitoring/grafana-dashboard-power.yaml diff --git a/scripts/dashboards_render_atlas.py b/scripts/dashboards_render_atlas.py index 9b949492..80612df3 100644 --- a/scripts/dashboards_render_atlas.py +++ b/scripts/dashboards_render_atlas.py @@ -431,6 +431,16 @@ TEST_SUCCESS_RATE = ( TEST_FAILURES_24H = ( f'sum by (result) (max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"failed|error"}}[24h]))' ) +HECATE_UPS_ON_BATTERY = "sum(hecate_ups_on_battery) or on() vector(0)" +HECATE_UPS_LOW_BATTERY = "sum(hecate_ups_low_battery) or on() vector(0)" +HECATE_UPS_RUNTIME_MIN = "min(hecate_ups_runtime_seconds) or on() vector(0)" +HECATE_UPS_RUNTIME_HEADROOM_PERCENT = ( + "100 * min(hecate_ups_runtime_seconds) / clamp_min(max(hecate_ups_threshold_seconds), 1)" +) +HECATE_UPS_TRIGGER_COUNT_1D = "increase(hecate_shutdown_triggers_total[1d]) or on() vector(0)" +CLIMATE_SENSOR_COUNT = "count(atlas_climate_temperature_celsius) or on() vector(0)" +CLIMATE_TEMP_MAX = "max(atlas_climate_temperature_celsius) or on() vector(0)" +CLIMATE_HUMIDITY_MAX = "max(atlas_climate_humidity_percent) or on() vector(0)" POSTGRES_CONN_USED = ( 'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") ' 'or label_replace(max(pg_settings_max_connections), "conn", "max", "__name__", ".*")' @@ -1117,12 +1127,164 @@ def build_overview(): {"color": "green", "value": 98}, ], } + storage_panels = [ + (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), + (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"), + (25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"), + (26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"), + ] + for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): + panels.append( + stat_panel( + panel_id, + title, + expr, + {"h": 3, "w": 6, "x": 6 * idx, "y": 8}, + unit=unit, + thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, + links=link_to("atlas-storage"), + ) + ) + + panels.append( + stat_panel( + 40, + "UPS Sources On Battery", + HECATE_UPS_ON_BATTERY, + {"h": 3, "w": 6, "x": 0, "y": 11}, + unit="none", + instant=True, + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "red", "value": 2}, + ], + }, + links=link_to("atlas-power"), + ) + ) + panels.append( + stat_panel( + 41, + "Lowest UPS Runtime", + HECATE_UPS_RUNTIME_MIN, + {"h": 3, "w": 6, "x": 6, "y": 11}, + unit="s", + decimals=0, + links=link_to("atlas-power"), + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 600}, + {"color": "yellow", "value": 1200}, + {"color": "green", "value": 1800}, + ], + }, + ) + ) + panels.append( + stat_panel( + 42, + "UPS Runtime Headroom", + HECATE_UPS_RUNTIME_HEADROOM_PERCENT, + {"h": 3, "w": 6, "x": 12, "y": 11}, + unit="percent", + decimals=1, + links=link_to("atlas-power"), + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 100}, + {"color": "yellow", "value": 110}, + {"color": "green", "value": 140}, + ], + }, + ) + ) + climate_panel = stat_panel( + 43, + "Climate Sensors Reporting", + CLIMATE_SENSOR_COUNT, + {"h": 3, "w": 6, "x": 18, "y": 11}, + unit="none", + decimals=0, + links=link_to("atlas-power"), + ) + climate_panel["description"] = "Climate metrics are reserved for future tent monitoring instrumentation." + panels.append(climate_panel) + + panels.append( + stat_panel( + 44, + "One-off Job Pods >1h", + f"sum(({ONEOFF_JOB_POD_AGE_HOURS}) > bool 1) or on() vector(0)", + {"h": 3, "w": 6, "x": 0, "y": 14}, + unit="none", + instant=True, + thresholds=count_thresholds, + links=link_to("atlas-jobs"), + ) + ) + panels.append( + stat_panel( + 45, + "Ariadne Attempts (24h)", + "sum(increase(ariadne_task_runs_total[24h]))", + {"h": 3, "w": 6, "x": 6, "y": 14}, + unit="none", + decimals=0, + links=link_to("atlas-jobs"), + ) + ) + test_success = stat_panel( + 46, + "Platform Test Success Rate", + TEST_SUCCESS_RATE, + {"h": 3, "w": 6, "x": 12, "y": 14}, + unit="percent", + decimals=2, + thresholds={ + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 90}, + {"color": "yellow", "value": 97}, + {"color": "green", "value": 99}, + ], + }, + links=link_to("atlas-jobs"), + ) + test_success["description"] = ( + "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. " + "This panel rolls up the shared Ariadne and Metis CI metrics from that internal dashboard." + ) + panels.append(test_success) + test_failures = stat_panel( + 47, + "Platform Test Failures (24h)", + "sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h]))", + {"h": 3, "w": 6, "x": 18, "y": 14}, + unit="none", + decimals=0, + instant=True, + thresholds=count_thresholds, + links=link_to("atlas-jobs"), + ) + test_failures["description"] = ( + "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query." + ) + panels.append(test_failures) + panels.append( stat_panel( 30, "Mail Sent (1d)", 'max(postmark_outbound_sent{window="1d"})', - {"h": 3, "w": 4, "x": 0, "y": 8}, + {"h": 3, "w": 4, "x": 0, "y": 17}, unit="none", links=link_to("atlas-mail"), ) @@ -1133,7 +1295,7 @@ def build_overview(): "type": "stat", "title": "Mail Bounces (1d)", "datasource": PROM_DS, - "gridPos": {"h": 3, "w": 4, "x": 8, "y": 8}, + "gridPos": {"h": 3, "w": 4, "x": 8, "y": 17}, "targets": [ { "expr": 'max(postmark_outbound_bounce_rate{window="1d"})', @@ -1179,7 +1341,7 @@ def build_overview(): 32, "Mail Success Rate (1d)", 'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)', - {"h": 3, "w": 4, "x": 4, "y": 8}, + {"h": 3, "w": 4, "x": 4, "y": 17}, unit="percent", thresholds=mail_success_thresholds, decimals=1, @@ -1191,7 +1353,7 @@ def build_overview(): 33, "Mail Limit Used (30d)", "max(postmark_sending_limit_used_percent)", - {"h": 3, "w": 4, "x": 12, "y": 8}, + {"h": 3, "w": 4, "x": 12, "y": 17}, unit="percent", thresholds=mail_limit_thresholds, decimals=1, @@ -1203,7 +1365,7 @@ def build_overview(): 34, "Postgres Connections Used", POSTGRES_CONN_USED, - {"h": 3, "w": 4, "x": 16, "y": 8}, + {"h": 3, "w": 4, "x": 16, "y": 17}, decimals=0, text_mode="name_and_value", legend="{{conn}}", @@ -1215,7 +1377,7 @@ def build_overview(): 35, "Postgres Hottest Connections", POSTGRES_CONN_HOTTEST, - {"h": 3, "w": 4, "x": 20, "y": 8}, + {"h": 3, "w": 4, "x": 20, "y": 17}, unit="none", decimals=0, text_mode="name_and_value", @@ -1224,121 +1386,6 @@ def build_overview(): ) ) - storage_panels = [ - (23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"), - (24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"), - (25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"), - (26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"), - ] - for idx, (panel_id, title, expr, unit) in enumerate(storage_panels): - panels.append( - stat_panel( - panel_id, - title, - expr, - {"h": 3, "w": 6, "x": 6 * idx, "y": 11}, - unit=unit, - thresholds=PERCENT_THRESHOLDS if unit == "percent" else None, - links=link_to("atlas-storage"), - ) - ) - - panels.append( - bargauge_panel( - 40, - "One-off Job Pods (age hours)", - ONEOFF_JOB_POD_AGE_HOURS, - {"h": 6, "w": 6, "x": 0, "y": 14}, - unit="h", - instant=True, - legend="{{namespace}}/{{pod}}", - thresholds=age_thresholds, - limit=8, - decimals=2, - ) - ) - panels.append( - { - "id": 41, - "type": "timeseries", - "title": "Ariadne Attempts / Failures", - "datasource": PROM_DS, - "gridPos": {"h": 6, "w": 6, "x": 6, "y": 14}, - "targets": [ - {"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"}, - {"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"}, - ], - "fieldConfig": { - "defaults": {"unit": "none"}, - "overrides": [ - { - "matcher": {"id": "byName", "options": "Attempts"}, - "properties": [ - {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}} - ], - }, - { - "matcher": {"id": "byName", "options": "Failures"}, - "properties": [ - {"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}} - ], - }, - ], - }, - "options": { - "legend": {"displayMode": "table", "placement": "right"}, - "tooltip": {"mode": "multi"}, - }, - } - ) - test_success = timeseries_panel( - 42, - "Platform Test Success Rate", - TEST_SUCCESS_RATE, - {"h": 6, "w": 6, "x": 12, "y": 14}, - unit="percent", - max_value=100, - legend=None, - legend_display="list", - ) - test_success["description"] = ( - "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. " - "This panel rolls up the shared Ariadne and Metis CI metrics from that internal dashboard." - ) - panels.append(test_success) - test_failures = bargauge_panel( - 43, - "Platform Tests with Failures (24h)", - TEST_FAILURES_24H, - {"h": 6, "w": 6, "x": 18, "y": 14}, - unit="none", - instant=True, - legend="{{result}}", - overrides=[ - { - "matcher": {"id": "byName", "options": "error"}, - "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}], - }, - { - "matcher": {"id": "byName", "options": "failed"}, - "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}], - }, - ], - thresholds={ - "mode": "absolute", - "steps": [ - {"color": "green", "value": None}, - {"color": "yellow", "value": 1}, - {"color": "orange", "value": 5}, - {"color": "red", "value": 10}, - ], - }, - ) - test_failures["description"] = ( - "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query." - ) - panels.append(test_failures) - cpu_scope = "$namespace_scope_cpu" gpu_scope = "$namespace_scope_gpu" ram_scope = "$namespace_scope_ram" @@ -2695,6 +2742,153 @@ def build_jobs_dashboard(): } +def build_power_dashboard(): + panels = [] + power_count_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 1}, + {"color": "red", "value": 2}, + ], + } + runtime_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 600}, + {"color": "yellow", "value": 1200}, + {"color": "green", "value": 1800}, + ], + } + headroom_thresholds = { + "mode": "absolute", + "steps": [ + {"color": "red", "value": None}, + {"color": "orange", "value": 100}, + {"color": "yellow", "value": 110}, + {"color": "green", "value": 140}, + ], + } + + panels.append( + stat_panel( + 1, + "UPS Sources On Battery", + HECATE_UPS_ON_BATTERY, + {"h": 4, "w": 6, "x": 0, "y": 0}, + unit="none", + instant=True, + thresholds=power_count_thresholds, + ) + ) + panels.append( + stat_panel( + 2, + "UPS Sources Low Battery", + HECATE_UPS_LOW_BATTERY, + {"h": 4, "w": 6, "x": 6, "y": 0}, + unit="none", + instant=True, + thresholds=power_count_thresholds, + ) + ) + panels.append( + stat_panel( + 3, + "Lowest Runtime Remaining", + HECATE_UPS_RUNTIME_MIN, + {"h": 4, "w": 6, "x": 12, "y": 0}, + unit="s", + decimals=0, + instant=True, + thresholds=runtime_thresholds, + ) + ) + panels.append( + stat_panel( + 4, + "Runtime Headroom", + HECATE_UPS_RUNTIME_HEADROOM_PERCENT, + {"h": 4, "w": 6, "x": 18, "y": 0}, + unit="percent", + decimals=1, + instant=True, + thresholds=headroom_thresholds, + ) + ) + panels.append( + timeseries_panel( + 5, + "UPS Runtime by Source", + "hecate_ups_runtime_seconds", + {"h": 8, "w": 12, "x": 0, "y": 4}, + unit="s", + legend="{{instance}}/{{source}}", + legend_display="table", + legend_placement="right", + ) + ) + panels.append( + timeseries_panel( + 6, + "UPS Trigger Activity by Source", + "hecate_ups_trigger_active", + {"h": 8, "w": 12, "x": 12, "y": 4}, + unit="none", + legend="{{instance}}/{{source}}", + legend_display="table", + legend_placement="right", + ) + ) + climate_panel = stat_panel( + 7, + "Climate Sensors Reporting", + CLIMATE_SENSOR_COUNT, + {"h": 4, "w": 8, "x": 0, "y": 12}, + unit="none", + decimals=0, + instant=True, + ) + climate_panel["description"] = "Reserved for tent climate telemetry wiring." + panels.append(climate_panel) + panels.append( + stat_panel( + 8, + "Max Tent Temperature", + CLIMATE_TEMP_MAX, + {"h": 4, "w": 8, "x": 8, "y": 12}, + unit="celsius", + decimals=1, + instant=True, + ) + ) + panels.append( + stat_panel( + 9, + "Max Tent Humidity", + CLIMATE_HUMIDITY_MAX, + {"h": 4, "w": 8, "x": 16, "y": 12}, + unit="percent", + decimals=1, + instant=True, + ) + ) + + return { + "uid": "atlas-power", + "title": "Atlas Power", + "folderUid": PRIVATE_FOLDER, + "editable": True, + "panels": panels, + "time": {"from": "now-24h", "to": "now"}, + "annotations": {"list": []}, + "schemaVersion": 39, + "style": "dark", + "tags": ["atlas", "power", "climate"], + } + + def build_gpu_dashboard(): panels = [] gpu_scope = "$namespace_scope_gpu" @@ -2792,6 +2986,10 @@ DASHBOARDS = { "builder": build_jobs_dashboard, "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml", }, + "atlas-power": { + "builder": build_power_dashboard, + "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-power.yaml", + }, "atlas-gpu": { "builder": build_gpu_dashboard, "configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml", diff --git a/services/monitoring/dashboards/atlas-overview.json b/services/monitoring/dashboards/atlas-overview.json index 6e634610..f0bf43f7 100644 --- a/services/monitoring/dashboards/atlas-overview.json +++ b/services/monitoring/dashboards/atlas-overview.json @@ -786,6 +786,882 @@ } ] }, + { + "id": 23, + "type": "stat", + "title": "Astreae Usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 8 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 75 + }, + { + "color": "red", + "value": 91.5 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] + }, + { + "id": 24, + "type": "stat", + "title": "Asteria Usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 8 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 75 + }, + { + "color": "red", + "value": 91.5 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] + }, + { + "id": 25, + "type": "stat", + "title": "Astreae Free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 8 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "decbytes", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] + }, + { + "id": 26, + "type": "stat", + "title": "Asteria Free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 8 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "decbytes", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] + }, + { + "id": 40, + "type": "stat", + "title": "UPS Sources On Battery", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 11 + }, + "targets": [ + { + "expr": "sum(hecate_ups_on_battery) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 2 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-power dashboard", + "url": "/d/atlas-power", + "targetBlank": true + } + ] + }, + { + "id": 41, + "type": "stat", + "title": "Lowest UPS Runtime", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 11 + }, + "targets": [ + { + "expr": "min(hecate_ups_runtime_seconds) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 600 + }, + { + "color": "yellow", + "value": 1200 + }, + { + "color": "green", + "value": 1800 + } + ] + }, + "unit": "s", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-power dashboard", + "url": "/d/atlas-power", + "targetBlank": true + } + ] + }, + { + "id": 42, + "type": "stat", + "title": "UPS Runtime Headroom", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 11 + }, + "targets": [ + { + "expr": "100 * min(hecate_ups_runtime_seconds) / clamp_min(max(hecate_ups_threshold_seconds), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 100 + }, + { + "color": "yellow", + "value": 110 + }, + { + "color": "green", + "value": 140 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-power dashboard", + "url": "/d/atlas-power", + "targetBlank": true + } + ] + }, + { + "id": 43, + "type": "stat", + "title": "Climate Sensors Reporting", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 11 + }, + "targets": [ + { + "expr": "count(atlas_climate_temperature_celsius) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-power dashboard", + "url": "/d/atlas-power", + "targetBlank": true + } + ], + "description": "Climate metrics are reserved for future tent monitoring instrumentation." + }, + { + "id": 44, + "type": "stat", + "title": "One-off Job Pods >1h", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 14 + }, + "targets": [ + { + "expr": "sum((((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})) > bool 1) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-jobs dashboard", + "url": "/d/atlas-jobs", + "targetBlank": true + } + ] + }, + { + "id": 45, + "type": "stat", + "title": "Ariadne Attempts (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 14 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[24h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-jobs dashboard", + "url": "/d/atlas-jobs", + "targetBlank": true + } + ] + }, + { + "id": 46, + "type": "stat", + "title": "Platform Test Success Rate", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 14 + }, + "targets": [ + { + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 90 + }, + { + "color": "yellow", + "value": 97 + }, + { + "color": "green", + "value": 99 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-jobs dashboard", + "url": "/d/atlas-jobs", + "targetBlank": true + } + ], + "description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. This panel rolls up the shared Ariadne and Metis CI metrics from that internal dashboard." + }, + { + "id": 47, + "type": "stat", + "title": "Platform Test Failures (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 14 + }, + "targets": [ + { + "expr": "sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h]))", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-jobs dashboard", + "url": "/d/atlas-jobs", + "targetBlank": true + } + ], + "description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query." + }, { "id": 30, "type": "stat", @@ -798,7 +1674,7 @@ "h": 3, "w": 4, "x": 0, - "y": 8 + "y": 17 }, "targets": [ { @@ -865,7 +1741,7 @@ "h": 3, "w": 4, "x": 8, - "y": 8 + "y": 17 }, "targets": [ { @@ -970,7 +1846,7 @@ "h": 3, "w": 4, "x": 4, - "y": 8 + "y": 17 }, "targets": [ { @@ -1046,7 +1922,7 @@ "h": 3, "w": 4, "x": 12, - "y": 8 + "y": 17 }, "targets": [ { @@ -1122,7 +1998,7 @@ "h": 3, "w": 4, "x": 16, - "y": 8 + "y": 17 }, "targets": [ { @@ -1185,7 +2061,7 @@ "h": 3, "w": 4, "x": 20, - "y": 8 + "y": 17 }, "targets": [ { @@ -1236,588 +2112,6 @@ "textMode": "name_and_value" } }, - { - "id": 23, - "type": "stat", - "title": "Astreae Usage", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 0, - "y": 11 - }, - "targets": [ - { - "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 75 - }, - { - "color": "red", - "value": 91.5 - } - ] - }, - "unit": "percent", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 24, - "type": "stat", - "title": "Asteria Usage", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 6, - "y": 11 - }, - "targets": [ - { - "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 75 - }, - { - "color": "red", - "value": 91.5 - } - ] - }, - "unit": "percent", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 25, - "type": "stat", - "title": "Astreae Free", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 12, - "y": 11 - }, - "targets": [ - { - "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "decbytes", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 26, - "type": "stat", - "title": "Asteria Free", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 18, - "y": 11 - }, - "targets": [ - { - "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "decbytes", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 40, - "type": "bargauge", - "title": "One-off Job Pods (age hours)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 0, - "y": 14 - }, - "targets": [ - { - "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", - "refId": "A", - "legendFormat": "{{namespace}}/{{pod}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 6 - }, - { - "color": "orange", - "value": 24 - }, - { - "color": "red", - "value": 48 - } - ] - }, - "decimals": 2 - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - }, - { - "id": "limit", - "options": { - "limit": 8 - } - } - ] - }, - { - "id": 41, - "type": "timeseries", - "title": "Ariadne Attempts / Failures", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 6, - "y": 14 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", - "refId": "A", - "legendFormat": "Attempts" - }, - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "B", - "legendFormat": "Failures" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Attempts" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "green" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Failures" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "red" - } - } - ] - } - ] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 42, - "type": "timeseries", - "title": "Platform Test Success Rate", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 12, - "y": 14 - }, - "targets": [ - { - "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "max": 100 - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. This panel rolls up the shared Ariadne and Metis CI metrics from that internal dashboard." - }, - { - "id": 43, - "type": "bargauge", - "title": "Platform Tests with Failures (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 18, - "y": 14 - }, - "targets": [ - { - "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))", - "refId": "A", - "legendFormat": "{{result}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 5 - }, - { - "color": "red", - "value": 10 - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "error" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "yellow" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "failed" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "red" - } - } - ] - } - ] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ], - "description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query." - }, { "id": 11, "type": "piechart", diff --git a/services/monitoring/dashboards/atlas-power.json b/services/monitoring/dashboards/atlas-power.json new file mode 100644 index 00000000..8e4c75d0 --- /dev/null +++ b/services/monitoring/dashboards/atlas-power.json @@ -0,0 +1,553 @@ +{ + "uid": "atlas-power", + "title": "Atlas Power", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "UPS Sources On Battery", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sum(hecate_ups_on_battery) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 2 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "UPS Sources Low Battery", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "sum(hecate_ups_low_battery) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 2 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Lowest Runtime Remaining", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "min(hecate_ups_runtime_seconds) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 600 + }, + { + "color": "yellow", + "value": 1200 + }, + { + "color": "green", + "value": 1800 + } + ] + }, + "unit": "s", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Runtime Headroom", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "100 * min(hecate_ups_runtime_seconds) / clamp_min(max(hecate_ups_threshold_seconds), 1)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 100 + }, + { + "color": "yellow", + "value": 110 + }, + { + "color": "green", + "value": 140 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "timeseries", + "title": "UPS Runtime by Source", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "targets": [ + { + "expr": "hecate_ups_runtime_seconds", + "refId": "A", + "legendFormat": "{{instance}}/{{source}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 6, + "type": "timeseries", + "title": "UPS Trigger Activity by Source", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "targets": [ + { + "expr": "hecate_ups_trigger_active", + "refId": "A", + "legendFormat": "{{instance}}/{{source}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 7, + "type": "stat", + "title": "Climate Sensors Reporting", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "count(atlas_climate_temperature_celsius) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "description": "Reserved for tent climate telemetry wiring." + }, + { + "id": 8, + "type": "stat", + "title": "Max Tent Temperature", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 12 + }, + "targets": [ + { + "expr": "max(atlas_climate_temperature_celsius) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "celsius", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 9, + "type": "stat", + "title": "Max Tent Humidity", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 12 + }, + "targets": [ + { + "expr": "max(atlas_climate_humidity_percent) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + } + ], + "time": { + "from": "now-24h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "power", + "climate" + ] +} diff --git a/services/monitoring/grafana-dashboard-overview.yaml b/services/monitoring/grafana-dashboard-overview.yaml index fea1544d..471a57a6 100644 --- a/services/monitoring/grafana-dashboard-overview.yaml +++ b/services/monitoring/grafana-dashboard-overview.yaml @@ -795,6 +795,882 @@ data: } ] }, + { + "id": 23, + "type": "stat", + "title": "Astreae Usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 8 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 75 + }, + { + "color": "red", + "value": 91.5 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] + }, + { + "id": 24, + "type": "stat", + "title": "Asteria Usage", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 8 + }, + "targets": [ + { + "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "orange", + "value": 75 + }, + { + "color": "red", + "value": 91.5 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] + }, + { + "id": 25, + "type": "stat", + "title": "Astreae Free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 8 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "decbytes", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] + }, + { + "id": 26, + "type": "stat", + "title": "Asteria Free", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 8 + }, + "targets": [ + { + "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "decbytes", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-storage dashboard", + "url": "/d/atlas-storage", + "targetBlank": true + } + ] + }, + { + "id": 40, + "type": "stat", + "title": "UPS Sources On Battery", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 11 + }, + "targets": [ + { + "expr": "sum(hecate_ups_on_battery) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 2 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-power dashboard", + "url": "/d/atlas-power", + "targetBlank": true + } + ] + }, + { + "id": 41, + "type": "stat", + "title": "Lowest UPS Runtime", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 11 + }, + "targets": [ + { + "expr": "min(hecate_ups_runtime_seconds) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 600 + }, + { + "color": "yellow", + "value": 1200 + }, + { + "color": "green", + "value": 1800 + } + ] + }, + "unit": "s", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-power dashboard", + "url": "/d/atlas-power", + "targetBlank": true + } + ] + }, + { + "id": 42, + "type": "stat", + "title": "UPS Runtime Headroom", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 11 + }, + "targets": [ + { + "expr": "100 * min(hecate_ups_runtime_seconds) / clamp_min(max(hecate_ups_threshold_seconds), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 100 + }, + { + "color": "yellow", + "value": 110 + }, + { + "color": "green", + "value": 140 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-power dashboard", + "url": "/d/atlas-power", + "targetBlank": true + } + ] + }, + { + "id": 43, + "type": "stat", + "title": "Climate Sensors Reporting", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 11 + }, + "targets": [ + { + "expr": "count(atlas_climate_temperature_celsius) or on() vector(0)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-power dashboard", + "url": "/d/atlas-power", + "targetBlank": true + } + ], + "description": "Climate metrics are reserved for future tent monitoring instrumentation." + }, + { + "id": 44, + "type": "stat", + "title": "One-off Job Pods >1h", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 14 + }, + "targets": [ + { + "expr": "sum((((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"})) > bool 1) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-jobs dashboard", + "url": "/d/atlas-jobs", + "targetBlank": true + } + ] + }, + { + "id": 45, + "type": "stat", + "title": "Ariadne Attempts (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 14 + }, + "targets": [ + { + "expr": "sum(increase(ariadne_task_runs_total[24h]))", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-jobs dashboard", + "url": "/d/atlas-jobs", + "targetBlank": true + } + ] + }, + { + "id": 46, + "type": "stat", + "title": "Platform Test Success Rate", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 14 + }, + "targets": [ + { + "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 90 + }, + { + "color": "yellow", + "value": 97 + }, + { + "color": "green", + "value": 99 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-jobs dashboard", + "url": "/d/atlas-jobs", + "targetBlank": true + } + ], + "description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. This panel rolls up the shared Ariadne and Metis CI metrics from that internal dashboard." + }, + { + "id": 47, + "type": "stat", + "title": "Platform Test Failures (24h)", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 14 + }, + "targets": [ + { + "expr": "sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h]))", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "links": [ + { + "title": "Open atlas-jobs dashboard", + "url": "/d/atlas-jobs", + "targetBlank": true + } + ], + "description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query." + }, { "id": 30, "type": "stat", @@ -807,7 +1683,7 @@ data: "h": 3, "w": 4, "x": 0, - "y": 8 + "y": 17 }, "targets": [ { @@ -874,7 +1750,7 @@ data: "h": 3, "w": 4, "x": 8, - "y": 8 + "y": 17 }, "targets": [ { @@ -979,7 +1855,7 @@ data: "h": 3, "w": 4, "x": 4, - "y": 8 + "y": 17 }, "targets": [ { @@ -1055,7 +1931,7 @@ data: "h": 3, "w": 4, "x": 12, - "y": 8 + "y": 17 }, "targets": [ { @@ -1131,7 +2007,7 @@ data: "h": 3, "w": 4, "x": 16, - "y": 8 + "y": 17 }, "targets": [ { @@ -1194,7 +2070,7 @@ data: "h": 3, "w": 4, "x": 20, - "y": 8 + "y": 17 }, "targets": [ { @@ -1245,588 +2121,6 @@ data: "textMode": "name_and_value" } }, - { - "id": 23, - "type": "stat", - "title": "Astreae Usage", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 0, - "y": 11 - }, - "targets": [ - { - "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"}) * 100)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 75 - }, - { - "color": "red", - "value": 91.5 - } - ] - }, - "unit": "percent", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 24, - "type": "stat", - "title": "Asteria Usage", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 6, - "y": 11 - }, - "targets": [ - { - "expr": "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"}) * 100)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "orange", - "value": 75 - }, - { - "color": "red", - "value": 91.5 - } - ] - }, - "unit": "percent", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 25, - "type": "stat", - "title": "Astreae Free", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 12, - "y": 11 - }, - "targets": [ - { - "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/astreae\",fstype!~\"tmpfs|overlay\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "decbytes", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 26, - "type": "stat", - "title": "Asteria Free", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 3, - "w": 6, - "x": 18, - "y": 11 - }, - "targets": [ - { - "expr": "sum(node_filesystem_avail_bytes{mountpoint=\"/mnt/asteria\",fstype!~\"tmpfs|overlay\"})", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(115, 115, 115, 1)", - "value": null - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "decbytes", - "custom": { - "displayMode": "auto" - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value" - }, - "links": [ - { - "title": "Open atlas-storage dashboard", - "url": "/d/atlas-storage", - "targetBlank": true - } - ] - }, - { - "id": 40, - "type": "bargauge", - "title": "One-off Job Pods (age hours)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 0, - "y": 14 - }, - "targets": [ - { - "expr": "sort_desc(((time() - kube_pod_start_time{pod!=\"\"}) / 3600) * on(namespace,pod) group_left(owner_name) (kube_pod_owner{owner_kind=\"Job\"} unless on(namespace, owner_name) label_replace(kube_job_owner{owner_kind=\"CronJob\"}, \"owner_name\", \"$1\", \"job_name\", \"(.*)\")) * on(namespace,pod) group_left(phase) max by (namespace,pod,phase) (kube_pod_status_phase{phase=~\"Running|Succeeded\"}))", - "refId": "A", - "legendFormat": "{{namespace}}/{{pod}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "h", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 6 - }, - { - "color": "orange", - "value": 24 - }, - { - "color": "red", - "value": 48 - } - ] - }, - "decimals": 2 - }, - "overrides": [] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - }, - { - "id": "limit", - "options": { - "limit": 8 - } - } - ] - }, - { - "id": 41, - "type": "timeseries", - "title": "Ariadne Attempts / Failures", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 6, - "y": 14 - }, - "targets": [ - { - "expr": "sum(increase(ariadne_task_runs_total[$__interval]))", - "refId": "A", - "legendFormat": "Attempts" - }, - { - "expr": "sum(increase(ariadne_task_runs_total{status=\"error\"}[$__interval]))", - "refId": "B", - "legendFormat": "Failures" - } - ], - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Attempts" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "green" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Failures" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "red" - } - } - ] - } - ] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "right" - }, - "tooltip": { - "mode": "multi" - } - } - }, - { - "id": 42, - "type": "timeseries", - "title": "Platform Test Success Rate", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 12, - "y": 14 - }, - "targets": [ - { - "expr": "100 * sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=\"passed\"}[30d])) / clamp_min(sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"passed|failed|error\"}[30d])), 1)", - "refId": "A" - } - ], - "fieldConfig": { - "defaults": { - "unit": "percent", - "max": 100 - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "description": "Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. This panel rolls up the shared Ariadne and Metis CI metrics from that internal dashboard." - }, - { - "id": 43, - "type": "bargauge", - "title": "Platform Tests with Failures (24h)", - "datasource": { - "type": "prometheus", - "uid": "atlas-vm" - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 18, - "y": 14 - }, - "targets": [ - { - "expr": "sort_desc(sum by (result) (max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h])))", - "refId": "A", - "legendFormat": "{{result}}", - "instant": true - } - ], - "fieldConfig": { - "defaults": { - "unit": "none", - "min": 0, - "max": null, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "orange", - "value": 5 - }, - { - "color": "red", - "value": 10 - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "error" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "yellow" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "failed" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "red" - } - } - ] - } - ] - }, - "options": { - "displayMode": "gradient", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "transformations": [ - { - "id": "sortBy", - "options": { - "fields": [ - "Value" - ], - "order": "desc" - } - } - ], - "description": "This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query." - }, { "id": 11, "type": "piechart", diff --git a/services/monitoring/grafana-dashboard-power.yaml b/services/monitoring/grafana-dashboard-power.yaml new file mode 100644 index 00000000..21b611c3 --- /dev/null +++ b/services/monitoring/grafana-dashboard-power.yaml @@ -0,0 +1,562 @@ +# services/monitoring/grafana-dashboard-power.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-power + labels: + grafana_dashboard: "1" +data: + atlas-power.json: | + { + "uid": "atlas-power", + "title": "Atlas Power", + "folderUid": "atlas-internal", + "editable": true, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "UPS Sources On Battery", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "targets": [ + { + "expr": "sum(hecate_ups_on_battery) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 2 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 2, + "type": "stat", + "title": "UPS Sources Low Battery", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "targets": [ + { + "expr": "sum(hecate_ups_low_battery) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 2 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 3, + "type": "stat", + "title": "Lowest Runtime Remaining", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "targets": [ + { + "expr": "min(hecate_ups_runtime_seconds) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 600 + }, + { + "color": "yellow", + "value": 1200 + }, + { + "color": "green", + "value": 1800 + } + ] + }, + "unit": "s", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 4, + "type": "stat", + "title": "Runtime Headroom", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "targets": [ + { + "expr": "100 * min(hecate_ups_runtime_seconds) / clamp_min(max(hecate_ups_threshold_seconds), 1)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 100 + }, + { + "color": "yellow", + "value": 110 + }, + { + "color": "green", + "value": 140 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 5, + "type": "timeseries", + "title": "UPS Runtime by Source", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "targets": [ + { + "expr": "hecate_ups_runtime_seconds", + "refId": "A", + "legendFormat": "{{instance}}/{{source}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 6, + "type": "timeseries", + "title": "UPS Trigger Activity by Source", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "targets": [ + { + "expr": "hecate_ups_trigger_active", + "refId": "A", + "legendFormat": "{{instance}}/{{source}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi" + } + } + }, + { + "id": 7, + "type": "stat", + "title": "Climate Sensors Reporting", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 12 + }, + "targets": [ + { + "expr": "count(atlas_climate_temperature_celsius) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none", + "custom": { + "displayMode": "auto" + }, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + }, + "description": "Reserved for tent climate telemetry wiring." + }, + { + "id": 8, + "type": "stat", + "title": "Max Tent Temperature", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 12 + }, + "targets": [ + { + "expr": "max(atlas_climate_temperature_celsius) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "celsius", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + }, + { + "id": 9, + "type": "stat", + "title": "Max Tent Humidity", + "datasource": { + "type": "prometheus", + "uid": "atlas-vm" + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 12 + }, + "targets": [ + { + "expr": "max(atlas_climate_humidity_percent) or on() vector(0)", + "refId": "A", + "instant": true + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(115, 115, 115, 1)", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percent", + "custom": { + "displayMode": "auto" + }, + "decimals": 1 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value" + } + } + ], + "time": { + "from": "now-24h", + "to": "now" + }, + "annotations": { + "list": [] + }, + "schemaVersion": 39, + "style": "dark", + "tags": [ + "atlas", + "power", + "climate" + ] + } diff --git a/services/monitoring/helmrelease.yaml b/services/monitoring/helmrelease.yaml index e7363536..670bf1f0 100644 --- a/services/monitoring/helmrelease.yaml +++ b/services/monitoring/helmrelease.yaml @@ -224,6 +224,16 @@ spec: target_label: instance replacement: titan-jh + # --- Hecate power telemetry (host-level daemon on UPS hosts) --- + - job_name: "hecate-power" + static_configs: + - targets: ["192.168.22.10:9560"] + labels: + instance: titan-db + - targets: ["192.168.22.26:9560"] + labels: + instance: titan-24 + # --- cert-manager (pods expose on 9402) --- - job_name: "cert-manager" kubernetes_sd_configs: [{ role: pod }] diff --git a/services/monitoring/kustomization.yaml b/services/monitoring/kustomization.yaml index 23c1595a..63bf8ffb 100644 --- a/services/monitoring/kustomization.yaml +++ b/services/monitoring/kustomization.yaml @@ -13,6 +13,7 @@ resources: - grafana-dashboard-storage.yaml - grafana-dashboard-network.yaml - grafana-dashboard-gpu.yaml + - grafana-dashboard-power.yaml - grafana-dashboard-mail.yaml - grafana-dashboard-jobs.yaml - dcgm-exporter.yaml