monitoring: add power dashboard and reorder atlas overview rows

This commit is contained in:
Brad Stein 2026-04-03 14:55:16 -03:00
parent e418183f56
commit bc9bf0310a
7 changed files with 3209 additions and 1297 deletions

View File

@ -431,6 +431,16 @@ TEST_SUCCESS_RATE = (
# PromQL: for each `result` label, sum the 24h peak of ariadne_ci_tests_total,
# limited to results matching "failed|error" and to the series selected by
# TEST_REPO_SELECTOR (defined elsewhere in this module).
TEST_FAILURES_24H = (
f'sum by (result) (max_over_time(ariadne_ci_tests_total{{{TEST_REPO_SELECTOR},result=~"failed|error"}}[24h]))'
)
# --- Hecate UPS / climate PromQL expressions ---------------------------------
# Each expression aggregates to a single series. Those ending in
# `or on() vector(0)` fall back to 0 when the metric has no series at all.
HECATE_UPS_ON_BATTERY = "sum(hecate_ups_on_battery) or on() vector(0)"
HECATE_UPS_LOW_BATTERY = "sum(hecate_ups_low_battery) or on() vector(0)"
HECATE_UPS_RUNTIME_MIN = "min(hecate_ups_runtime_seconds) or on() vector(0)"
# Lowest reported runtime as a percentage of the largest threshold;
# clamp_min(..., 1) keeps the divisor from being zero.
HECATE_UPS_RUNTIME_HEADROOM_PERCENT = (
    "100 * min(hecate_ups_runtime_seconds) / clamp_min(max(hecate_ups_threshold_seconds), 1)"
)
# increase() yields one series per input series, so it is wrapped in sum() to
# produce a single total like the sibling expressions; without it the
# vector(0) fallback would only apply when no series exist and a stat panel
# would receive one value per scraped source.
HECATE_UPS_TRIGGER_COUNT_1D = "sum(increase(hecate_shutdown_triggers_total[1d])) or on() vector(0)"
CLIMATE_SENSOR_COUNT = "count(atlas_climate_temperature_celsius) or on() vector(0)"
CLIMATE_TEMP_MAX = "max(atlas_climate_temperature_celsius) or on() vector(0)"
CLIMATE_HUMIDITY_MAX = "max(atlas_climate_humidity_percent) or on() vector(0)"
POSTGRES_CONN_USED = (
'label_replace(sum(pg_stat_activity_count), "conn", "used", "__name__", ".*") '
'or label_replace(max(pg_settings_max_connections), "conn", "max", "__name__", ".*")'
@ -1117,12 +1127,164 @@ def build_overview():
{"color": "green", "value": 98},
],
}
storage_panels = [
(23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
(24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"),
(25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"),
(26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"),
]
for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
panels.append(
stat_panel(
panel_id,
title,
expr,
{"h": 3, "w": 6, "x": 6 * idx, "y": 8},
unit=unit,
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
links=link_to("atlas-storage"),
)
)
panels.append(
stat_panel(
40,
"UPS Sources On Battery",
HECATE_UPS_ON_BATTERY,
{"h": 3, "w": 6, "x": 0, "y": 11},
unit="none",
instant=True,
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "red", "value": 2},
],
},
links=link_to("atlas-power"),
)
)
panels.append(
stat_panel(
41,
"Lowest UPS Runtime",
HECATE_UPS_RUNTIME_MIN,
{"h": 3, "w": 6, "x": 6, "y": 11},
unit="s",
decimals=0,
links=link_to("atlas-power"),
thresholds={
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 600},
{"color": "yellow", "value": 1200},
{"color": "green", "value": 1800},
],
},
)
)
panels.append(
stat_panel(
42,
"UPS Runtime Headroom",
HECATE_UPS_RUNTIME_HEADROOM_PERCENT,
{"h": 3, "w": 6, "x": 12, "y": 11},
unit="percent",
decimals=1,
links=link_to("atlas-power"),
thresholds={
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 100},
{"color": "yellow", "value": 110},
{"color": "green", "value": 140},
],
},
)
)
climate_panel = stat_panel(
43,
"Climate Sensors Reporting",
CLIMATE_SENSOR_COUNT,
{"h": 3, "w": 6, "x": 18, "y": 11},
unit="none",
decimals=0,
links=link_to("atlas-power"),
)
climate_panel["description"] = "Climate metrics are reserved for future tent monitoring instrumentation."
panels.append(climate_panel)
panels.append(
stat_panel(
44,
"One-off Job Pods >1h",
f"sum(({ONEOFF_JOB_POD_AGE_HOURS}) > bool 1) or on() vector(0)",
{"h": 3, "w": 6, "x": 0, "y": 14},
unit="none",
instant=True,
thresholds=count_thresholds,
links=link_to("atlas-jobs"),
)
)
panels.append(
stat_panel(
45,
"Ariadne Attempts (24h)",
"sum(increase(ariadne_task_runs_total[24h]))",
{"h": 3, "w": 6, "x": 6, "y": 14},
unit="none",
decimals=0,
links=link_to("atlas-jobs"),
)
)
test_success = stat_panel(
46,
"Platform Test Success Rate",
TEST_SUCCESS_RATE,
{"h": 3, "w": 6, "x": 12, "y": 14},
unit="percent",
decimals=2,
thresholds={
"mode": "absolute",
"steps": [
{"color": "red", "value": None},
{"color": "orange", "value": 90},
{"color": "yellow", "value": 97},
{"color": "green", "value": 99},
],
},
links=link_to("atlas-jobs"),
)
test_success["description"] = (
"Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. "
"This panel rolls up the shared Ariadne and Metis CI metrics from that internal dashboard."
)
panels.append(test_success)
test_failures = stat_panel(
47,
"Platform Test Failures (24h)",
"sum(max_over_time(ariadne_ci_tests_total{repo=~\"ariadne|metis\",result=~\"failed|error\"}[24h]))",
{"h": 3, "w": 6, "x": 18, "y": 14},
unit="none",
decimals=0,
instant=True,
thresholds=count_thresholds,
links=link_to("atlas-jobs"),
)
test_failures["description"] = (
"This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
)
panels.append(test_failures)
panels.append(
stat_panel(
30,
"Mail Sent (1d)",
'max(postmark_outbound_sent{window="1d"})',
{"h": 3, "w": 4, "x": 0, "y": 8},
{"h": 3, "w": 4, "x": 0, "y": 17},
unit="none",
links=link_to("atlas-mail"),
)
@ -1133,7 +1295,7 @@ def build_overview():
"type": "stat",
"title": "Mail Bounces (1d)",
"datasource": PROM_DS,
"gridPos": {"h": 3, "w": 4, "x": 8, "y": 8},
"gridPos": {"h": 3, "w": 4, "x": 8, "y": 17},
"targets": [
{
"expr": 'max(postmark_outbound_bounce_rate{window="1d"})',
@ -1179,7 +1341,7 @@ def build_overview():
32,
"Mail Success Rate (1d)",
'clamp_min(100 - max(postmark_outbound_bounce_rate{window="1d"}), 0)',
{"h": 3, "w": 4, "x": 4, "y": 8},
{"h": 3, "w": 4, "x": 4, "y": 17},
unit="percent",
thresholds=mail_success_thresholds,
decimals=1,
@ -1191,7 +1353,7 @@ def build_overview():
33,
"Mail Limit Used (30d)",
"max(postmark_sending_limit_used_percent)",
{"h": 3, "w": 4, "x": 12, "y": 8},
{"h": 3, "w": 4, "x": 12, "y": 17},
unit="percent",
thresholds=mail_limit_thresholds,
decimals=1,
@ -1203,7 +1365,7 @@ def build_overview():
34,
"Postgres Connections Used",
POSTGRES_CONN_USED,
{"h": 3, "w": 4, "x": 16, "y": 8},
{"h": 3, "w": 4, "x": 16, "y": 17},
decimals=0,
text_mode="name_and_value",
legend="{{conn}}",
@ -1215,7 +1377,7 @@ def build_overview():
35,
"Postgres Hottest Connections",
POSTGRES_CONN_HOTTEST,
{"h": 3, "w": 4, "x": 20, "y": 8},
{"h": 3, "w": 4, "x": 20, "y": 17},
unit="none",
decimals=0,
text_mode="name_and_value",
@ -1224,121 +1386,6 @@ def build_overview():
)
)
storage_panels = [
(23, "Astreae Usage", astreae_usage_expr("/mnt/astreae"), "percent"),
(24, "Asteria Usage", astreae_usage_expr("/mnt/asteria"), "percent"),
(25, "Astreae Free", astreae_free_expr("/mnt/astreae"), "decbytes"),
(26, "Asteria Free", astreae_free_expr("/mnt/asteria"), "decbytes"),
]
for idx, (panel_id, title, expr, unit) in enumerate(storage_panels):
panels.append(
stat_panel(
panel_id,
title,
expr,
{"h": 3, "w": 6, "x": 6 * idx, "y": 11},
unit=unit,
thresholds=PERCENT_THRESHOLDS if unit == "percent" else None,
links=link_to("atlas-storage"),
)
)
panels.append(
bargauge_panel(
40,
"One-off Job Pods (age hours)",
ONEOFF_JOB_POD_AGE_HOURS,
{"h": 6, "w": 6, "x": 0, "y": 14},
unit="h",
instant=True,
legend="{{namespace}}/{{pod}}",
thresholds=age_thresholds,
limit=8,
decimals=2,
)
)
panels.append(
{
"id": 41,
"type": "timeseries",
"title": "Ariadne Attempts / Failures",
"datasource": PROM_DS,
"gridPos": {"h": 6, "w": 6, "x": 6, "y": 14},
"targets": [
{"expr": ARIADNE_TASK_ATTEMPTS_SERIES, "refId": "A", "legendFormat": "Attempts"},
{"expr": ARIADNE_TASK_FAILURES_SERIES, "refId": "B", "legendFormat": "Failures"},
],
"fieldConfig": {
"defaults": {"unit": "none"},
"overrides": [
{
"matcher": {"id": "byName", "options": "Attempts"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}
],
},
{
"matcher": {"id": "byName", "options": "Failures"},
"properties": [
{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}
],
},
],
},
"options": {
"legend": {"displayMode": "table", "placement": "right"},
"tooltip": {"mode": "multi"},
},
}
)
test_success = timeseries_panel(
42,
"Platform Test Success Rate",
TEST_SUCCESS_RATE,
{"h": 6, "w": 6, "x": 12, "y": 14},
unit="percent",
max_value=100,
legend=None,
legend_display="list",
)
test_success["description"] = (
"Atlas Overview mirrors the Atlas Jobs internal dashboard for automation test health. "
"This panel rolls up the shared Ariadne and Metis CI metrics from that internal dashboard."
)
panels.append(test_success)
test_failures = bargauge_panel(
43,
"Platform Tests with Failures (24h)",
TEST_FAILURES_24H,
{"h": 6, "w": 6, "x": 18, "y": 14},
unit="none",
instant=True,
legend="{{result}}",
overrides=[
{
"matcher": {"id": "byName", "options": "error"},
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}],
},
{
"matcher": {"id": "byName", "options": "failed"},
"properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}],
},
],
thresholds={
"mode": "absolute",
"steps": [
{"color": "green", "value": None},
{"color": "yellow", "value": 1},
{"color": "orange", "value": 5},
{"color": "red", "value": 10},
],
},
)
test_failures["description"] = (
"This summary is sourced from the Atlas Jobs internal dashboard rather than a separate overview-only query."
)
panels.append(test_failures)
cpu_scope = "$namespace_scope_cpu"
gpu_scope = "$namespace_scope_gpu"
ram_scope = "$namespace_scope_ram"
@ -2695,6 +2742,153 @@ def build_jobs_dashboard():
}
def build_power_dashboard():
    """Assemble the "Atlas Power" dashboard definition (uid ``atlas-power``).

    Panels are placed in three bands on the grid:

    * y=0  -- four UPS summary stats (ids 1-4), each 6 columns wide.
    * y=4  -- two per-source time series (ids 5-6), each 12 columns wide.
    * y=12 -- three climate stats (ids 7-9), each 8 columns wide.

    Returns:
        dict: The dashboard model, filed under ``PRIVATE_FOLDER`` with a
        default time range of ``now-24h`` to ``now``.
    """

    def absolute_steps(base_color, *levels):
        # Base step carries value None; the remaining steps are taken from
        # the given (color, value) pairs in order.
        ladder = [{"color": base_color, "value": None}]
        for color, value in levels:
            ladder.append({"color": color, "value": value})
        return {"mode": "absolute", "steps": ladder}

    count_steps = absolute_steps("green", ("yellow", 1), ("red", 2))
    runtime_steps = absolute_steps("red", ("orange", 600), ("yellow", 1200), ("green", 1800))
    headroom_steps = absolute_steps("red", ("orange", 100), ("yellow", 110), ("green", 140))

    dashboard_panels = []

    # Band 1: UPS summary stats. Both count panels share `count_steps`.
    summary_specs = (
        (
            1,
            "UPS Sources On Battery",
            HECATE_UPS_ON_BATTERY,
            {"unit": "none", "instant": True, "thresholds": count_steps},
        ),
        (
            2,
            "UPS Sources Low Battery",
            HECATE_UPS_LOW_BATTERY,
            {"unit": "none", "instant": True, "thresholds": count_steps},
        ),
        (
            3,
            "Lowest Runtime Remaining",
            HECATE_UPS_RUNTIME_MIN,
            {"unit": "s", "decimals": 0, "instant": True, "thresholds": runtime_steps},
        ),
        (
            4,
            "Runtime Headroom",
            HECATE_UPS_RUNTIME_HEADROOM_PERCENT,
            {"unit": "percent", "decimals": 1, "instant": True, "thresholds": headroom_steps},
        ),
    )
    for column, (panel_id, title, expr, options) in enumerate(summary_specs):
        grid = {"h": 4, "w": 6, "x": 6 * column, "y": 0}
        dashboard_panels.append(stat_panel(panel_id, title, expr, grid, **options))

    # Band 2: raw per-source series, labelled by instance/source.
    trend_specs = (
        (5, "UPS Runtime by Source", "hecate_ups_runtime_seconds", "s"),
        (6, "UPS Trigger Activity by Source", "hecate_ups_trigger_active", "none"),
    )
    for column, (panel_id, title, expr, unit) in enumerate(trend_specs):
        grid = {"h": 8, "w": 12, "x": 12 * column, "y": 4}
        dashboard_panels.append(
            timeseries_panel(
                panel_id,
                title,
                expr,
                grid,
                unit=unit,
                legend="{{instance}}/{{source}}",
                legend_display="table",
                legend_placement="right",
            )
        )

    # Band 3: climate stats. Only the sensor-count panel carries a description.
    climate_specs = (
        (
            7,
            "Climate Sensors Reporting",
            CLIMATE_SENSOR_COUNT,
            "none",
            0,
            "Reserved for tent climate telemetry wiring.",
        ),
        (8, "Max Tent Temperature", CLIMATE_TEMP_MAX, "celsius", 1, None),
        (9, "Max Tent Humidity", CLIMATE_HUMIDITY_MAX, "percent", 1, None),
    )
    for column, (panel_id, title, expr, unit, decimals, description) in enumerate(climate_specs):
        grid = {"h": 4, "w": 8, "x": 8 * column, "y": 12}
        panel = stat_panel(
            panel_id,
            title,
            expr,
            grid,
            unit=unit,
            decimals=decimals,
            instant=True,
        )
        if description is not None:
            panel["description"] = description
        dashboard_panels.append(panel)

    return {
        "uid": "atlas-power",
        "title": "Atlas Power",
        "folderUid": PRIVATE_FOLDER,
        "editable": True,
        "panels": dashboard_panels,
        "time": {"from": "now-24h", "to": "now"},
        "annotations": {"list": []},
        "schemaVersion": 39,
        "style": "dark",
        "tags": ["atlas", "power", "climate"],
    }
def build_gpu_dashboard():
panels = []
gpu_scope = "$namespace_scope_gpu"
@ -2792,6 +2986,10 @@ DASHBOARDS = {
"builder": build_jobs_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-jobs.yaml",
},
"atlas-power": {
"builder": build_power_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-power.yaml",
},
"atlas-gpu": {
"builder": build_gpu_dashboard,
"configmap": ROOT / "services" / "monitoring" / "grafana-dashboard-gpu.yaml",

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,553 @@
{
"uid": "atlas-power",
"title": "Atlas Power",
"folderUid": "atlas-internal",
"editable": true,
"panels": [
{
"id": 1,
"type": "stat",
"title": "UPS Sources On Battery",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "sum(hecate_ups_on_battery) or on() vector(0)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "red",
"value": 2
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 2,
"type": "stat",
"title": "UPS Sources Low Battery",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 6,
"y": 0
},
"targets": [
{
"expr": "sum(hecate_ups_low_battery) or on() vector(0)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "red",
"value": 2
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 3,
"type": "stat",
"title": "Lowest Runtime Remaining",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "min(hecate_ups_runtime_seconds) or on() vector(0)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 600
},
{
"color": "yellow",
"value": 1200
},
{
"color": "green",
"value": 1800
}
]
},
"unit": "s",
"custom": {
"displayMode": "auto"
},
"decimals": 0
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Runtime Headroom",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "100 * min(hecate_ups_runtime_seconds) / clamp_min(max(hecate_ups_threshold_seconds), 1)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 100
},
{
"color": "yellow",
"value": 110
},
{
"color": "green",
"value": 140
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "timeseries",
"title": "UPS Runtime by Source",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "hecate_ups_runtime_seconds",
"refId": "A",
"legendFormat": "{{instance}}/{{source}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 6,
"type": "timeseries",
"title": "UPS Trigger Activity by Source",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 4
},
"targets": [
{
"expr": "hecate_ups_trigger_active",
"refId": "A",
"legendFormat": "{{instance}}/{{source}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "none"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 7,
"type": "stat",
"title": "Climate Sensors Reporting",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 12
},
"targets": [
{
"expr": "count(atlas_climate_temperature_celsius) or on() vector(0)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 0
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"description": "Reserved for tent climate telemetry wiring."
},
{
"id": 8,
"type": "stat",
"title": "Max Tent Temperature",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 12
},
"targets": [
{
"expr": "max(atlas_climate_temperature_celsius) or on() vector(0)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "celsius",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 9,
"type": "stat",
"title": "Max Tent Humidity",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 12
},
"targets": [
{
"expr": "max(atlas_climate_humidity_percent) or on() vector(0)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
}
],
"time": {
"from": "now-24h",
"to": "now"
},
"annotations": {
"list": []
},
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"power",
"climate"
]
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,562 @@
# services/monitoring/grafana-dashboard-power.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-power
labels:
grafana_dashboard: "1"
data:
atlas-power.json: |
{
"uid": "atlas-power",
"title": "Atlas Power",
"folderUid": "atlas-internal",
"editable": true,
"panels": [
{
"id": 1,
"type": "stat",
"title": "UPS Sources On Battery",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 0
},
"targets": [
{
"expr": "sum(hecate_ups_on_battery) or on() vector(0)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "red",
"value": 2
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 2,
"type": "stat",
"title": "UPS Sources Low Battery",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 6,
"y": 0
},
"targets": [
{
"expr": "sum(hecate_ups_low_battery) or on() vector(0)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "red",
"value": 2
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
}
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 3,
"type": "stat",
"title": "Lowest Runtime Remaining",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"targets": [
{
"expr": "min(hecate_ups_runtime_seconds) or on() vector(0)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 600
},
{
"color": "yellow",
"value": 1200
},
{
"color": "green",
"value": 1800
}
]
},
"unit": "s",
"custom": {
"displayMode": "auto"
},
"decimals": 0
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 4,
"type": "stat",
"title": "Runtime Headroom",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 0
},
"targets": [
{
"expr": "100 * min(hecate_ups_runtime_seconds) / clamp_min(max(hecate_ups_threshold_seconds), 1)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "orange",
"value": 100
},
{
"color": "yellow",
"value": 110
},
{
"color": "green",
"value": 140
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 5,
"type": "timeseries",
"title": "UPS Runtime by Source",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 4
},
"targets": [
{
"expr": "hecate_ups_runtime_seconds",
"refId": "A",
"legendFormat": "{{instance}}/{{source}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 6,
"type": "timeseries",
"title": "UPS Trigger Activity by Source",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 4
},
"targets": [
{
"expr": "hecate_ups_trigger_active",
"refId": "A",
"legendFormat": "{{instance}}/{{source}}"
}
],
"fieldConfig": {
"defaults": {
"unit": "none"
},
"overrides": []
},
"options": {
"legend": {
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi"
}
}
},
{
"id": 7,
"type": "stat",
"title": "Climate Sensors Reporting",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 12
},
"targets": [
{
"expr": "count(atlas_climate_temperature_celsius) or on() vector(0)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "none",
"custom": {
"displayMode": "auto"
},
"decimals": 0
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
},
"description": "Reserved for tent climate telemetry wiring."
},
{
"id": 8,
"type": "stat",
"title": "Max Tent Temperature",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 8,
"y": 12
},
"targets": [
{
"expr": "max(atlas_climate_temperature_celsius) or on() vector(0)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "celsius",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
},
{
"id": 9,
"type": "stat",
"title": "Max Tent Humidity",
"datasource": {
"type": "prometheus",
"uid": "atlas-vm"
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 12
},
"targets": [
{
"expr": "max(atlas_climate_humidity_percent) or on() vector(0)",
"refId": "A",
"instant": true
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(115, 115, 115, 1)",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"unit": "percent",
"custom": {
"displayMode": "auto"
},
"decimals": 1
},
"overrides": []
},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value"
}
}
],
"time": {
"from": "now-24h",
"to": "now"
},
"annotations": {
"list": []
},
"schemaVersion": 39,
"style": "dark",
"tags": [
"atlas",
"power",
"climate"
]
}

View File

@ -224,6 +224,16 @@ spec:
target_label: instance
replacement: titan-jh
# --- Hecate power telemetry (host-level daemon on UPS hosts) ---
- job_name: "hecate-power"
static_configs:
- targets: ["192.168.22.10:9560"]
labels:
instance: titan-db
- targets: ["192.168.22.26:9560"]
labels:
instance: titan-24
# --- cert-manager (pods expose on 9402) ---
- job_name: "cert-manager"
kubernetes_sd_configs: [{ role: pod }]

View File

@ -13,6 +13,7 @@ resources:
- grafana-dashboard-storage.yaml
- grafana-dashboard-network.yaml
- grafana-dashboard-gpu.yaml
- grafana-dashboard-power.yaml
- grafana-dashboard-mail.yaml
- grafana-dashboard-jobs.yaml
- dcgm-exporter.yaml